Student: Jorge Augusto Salgado Salhani
import numpy as np
import networkx as nx
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import scipy.stats as stats
import warnings
warnings.filterwarnings("ignore")
In order to read each file containing the edge lists in txt and gml format, we can build the two functions below
def read_gml_and_Gcc(path, nodetype=int):
    """
    Read a GML file containing node connections and their weights.

    PARAMETERS:
        path: file path as string located in computer
        nodetype: kept for interface compatibility (node labels are
            relabelled to consecutive integers regardless)
    RETURN:
        tuple (pos, G): the position object (networkx spring_layout,
        computed on the full graph, for drawing purposes) and the
        largest connected component of the undirected, self-loop-free
        graph built from the file.
    """
    G = nx.read_gml(path)
    G = nx.convert_node_labels_to_integers(G, first_label=0)
    pos = nx.spring_layout(G)
    G = G.to_undirected()
    # Graph.selfloop_edges() and nx.connected_component_subgraphs()
    # were removed in networkx 2.4; these are the supported replacements.
    G.remove_edges_from(nx.selfloop_edges(G))
    largest_cc = max(nx.connected_components(G), key=len)
    G = G.subgraph(largest_cc).copy()
    return pos, G
def read_txt_and_Gcc(path, comment='%', nodetype=int):
    """
    Read an edge-list text file with node connections and float weights.

    Example of readable file
    ---------
    %This is an example file.
    %NodeA NodeB Weight
    0 1 0.5
    1 3 2.0
    2 3 1.2
    ---------
    Comments can be marked as wished ('%' is set as default).

    PARAMETERS:
        path: file path as string located in computer
        comment: char (or string) used in file to mark comments
        nodetype: type used to convert node labels while parsing
    RETURN:
        tuple (pos, G): the position object (networkx spring_layout,
        computed on the full graph, for drawing purposes) and the
        largest connected component of the undirected, self-loop-free
        graph built from the file.
    """
    G = nx.read_edgelist(path, comments=comment, nodetype=nodetype, data=(('weight', float),))
    G = nx.convert_node_labels_to_integers(G, first_label=0)
    pos = nx.spring_layout(G)
    G = G.to_undirected()
    # Graph.selfloop_edges() and nx.connected_component_subgraphs()
    # were removed in networkx 2.4; these are the supported replacements.
    G.remove_edges_from(nx.selfloop_edges(G))
    largest_cc = max(nx.connected_components(G), key=len)
    G = G.subgraph(largest_cc).copy()
    return pos, G
Now we must store each networkx graph separately
# Load each edge-list network, keeping only its giant component.
pos_euro, G_euroroad = read_txt_and_Gcc('/home/jorge/Documents/redes_complexas/subelj_euroroad/out.subelj_euroroad_euroroad')
pos_hamst, G_hamster = read_txt_and_Gcc('/home/jorge/Documents/redes_complexas/petster-friendships-hamster/out.petster-friendships-hamster-uniq')
For C. elegans, in particular, we have repeated edges connected, so we altered the original file and inserted 'multigraph 1' in its header. Therefore, we can read it normally
# C. elegans is re-wrapped in nx.Graph() to collapse the multigraph's
# parallel edges into single edges.
G_celegans = nx.Graph()
pos_celegans, G_celegans = read_gml_and_Gcc('/home/jorge/Documents/redes_complexas/celegansneural/celegansneural.gml')
G_celegans = nx.Graph(G_celegans)
pos_usair, G_usair = read_txt_and_Gcc('/home/jorge/Documents/redes_complexas/US_largest500_airportnetwork.txt')
Random Walk Accessibility is one of the measures we want to take, so we must build a function to this purpose
def build_transit_probab_matrix(G):
    """
    Build the transition probability matrix of graph G.

    Each row of the (possibly weighted) adjacency matrix is normalized
    by its row sum, so entry (i, j) is the probability of a random
    walker stepping from node i to node j.

    PARAMETERS:
        G: networkx graph object
    RETURN:
        numpy array (N x N, N = number of nodes): transition
        probability matrix. Rows belonging to isolated nodes are left
        as zeros.
    """
    # nx.to_numpy_matrix was removed in networkx 3.0; to_numpy_array is
    # the supported equivalent and returns a plain ndarray.
    adj_matrix = nx.to_numpy_array(G, nodelist=list(G.nodes()))
    # Row sums are computed once instead of inside the element loop,
    # and the normalization is vectorized. `where` keeps zero rows
    # (isolated nodes) at zero instead of dividing by zero.
    row_sums = adj_matrix.sum(axis=1, keepdims=True)
    P_matrix = np.divide(adj_matrix, row_sums,
                         out=np.zeros_like(adj_matrix),
                         where=row_sums > 0)
    return P_matrix
def accessibility(G):
    """
    Calculate the generalized random walk accessibility of each node.

    Uses the matrix exponential of the transition probability matrix,
    P2 = e^P / e, and the entropy-based definition
    alpha(i) = exp(-sum_j P2[i, j] * ln(P2[i, j])).

    PARAMETERS:
        G: networkx graph object
    RETURN:
        numpy array of length N (N = number of nodes) with the
        accessibility of each node.
    """
    import scipy.linalg as alg
    N = G.number_of_nodes()
    P = build_transit_probab_matrix(G)
    P2 = alg.expm(P) / np.exp(1)
    vacc = np.zeros(N, dtype=float)
    for i in range(N):
        row = P2[i]
        # Only strictly positive entries contribute to the entropy sum;
        # filtering them also avoids log(0).
        positive = row[row > 0]
        vacc[i] = np.exp(-np.sum(positive * np.log(positive)))
    return vacc
Finally, to facilitate the correlation matrix plot, we can also define a function which calculates all the measurements we want given a networkx graph
In order to get explanations easier, here we are going to show all measurements and their associated equations.
Closeness centrality is defined as the inverse of the average distance between the i-th node and all others. So, given a distance matrix $\textit{D}$ with elements $\textit{D}_{ij}$, this measurement is defined as
$$ \begin{equation} C_i = N \Bigg[\sum_{j=1, j\neq i}^N d_{ij}\Bigg]^{-1} \end{equation} $$where $\textit{d}_{ij}$ is the shortest path between nodes i and j.
Betweenness centrality considers all the possible shortest path (not only the average, for example) and consider, for the i-th node, the ones over which the path go through i. Therefore, considering $\eta (a,b)$ the total count number of shortest paths between nodes a and b and $\eta (a,i,b)$ the ones that passes through node i,
$$ \begin{equation} B_i = \sum_{a,b} \frac{\eta(a,i,b)}{\eta(a,b)}. \end{equation} $$Another measure which correlates with betweenness centrality considers random walk particles. In this formulation,
$$ \begin{equation} B_i = \sum_{a=1}^N \sum_{b=1}^N w(a,i,b), \end{equation} $$where $\textit{w}(a,i,b)$ stands for the number of times node i is visited. Since the process is stochastic, the number of steps considered and the average visitation will vary.
Eigenvector centrality is defined by a vector $\textit{x}$ of which each element $\textit{x}_i$ represents a quantity that reflects the weights of all nodes that connects with it. Hence
$$ \begin{equation} x_i = \frac{1}{\lambda} \sum_{k = 1}^N A_{i,k} x_k, \end{equation} $$where $\textit{A}_{i,k}$ represents the adjacency matrix elements and $\lambda$ is a constant (eigenvalue). Each $\textit{x}_i$ is called i-th node eigenvector centrality.
For $\lambda$ we consider the highest value among all eigenvalues (Perron-Frobenius theorem). The vector x can also be found by the power method, where
$$ \begin{equation} x^{(k)} = x^{(k-1)} A. \end{equation} $$In case we let $\textit{x}^{(0)} = [1,1, \ldots, 1]$, $\textit{x}^{(1)}$ represents the degree of each node and $\textit{x}^{(n)}$, the number of paths that leads to i-th node after n steps. After perform several calculations for power method, we can put the value for $\lambda$ as the highest value found (like normalization constant).
PageRank measurement also considers random walk, however now we consider the transit probability matrix $\textit{P}$ such that each element $\textit{P}_{i,j}$ is defined as
$$ \begin{equation} P_{i,j} = A_{i,j} \Bigg[\sum_{j=1}^N A_{i,j}\Bigg]^{-1}, \end{equation} $$where $\textit{A}$ is the adjacency matrix.
Then we can define the eigenvectors $\pi$ and the Google matrix G such that
$$ \begin{equation} \pi^T = \pi^T G \end{equation} $$and
$$ \begin{equation} G = \kappa \Bigg(P + \frac{a e^T}{N}\Bigg) +\frac{\big(1-\kappa\big)}{N}e e^T \end{equation} $$The k-core decomposition, in turn, divides the graph into smaller subgraphs by iteratively removing the nodes that have fewer than k connections. It therefore halts only when all nodes of the remaining subgraphs have degree at least k. When halted, every remaining node i has kc(i) = k. The most central nodes will have the highest kc(i).
Such as the two last measurements, this one considers random walk movements. Formally, the i-th node accessibility is given by
$$ \begin{equation} \alpha_h (i) = \exp{\Bigg[-\sum_{j=1}^N P_{ij}^{(h)} \ln{\big(P_{ij}^{(h)}\big)}\Bigg]} \end{equation} $$where $P_{ij}^{(h)}$ stands for the probability of going from i to j by a path of length h.
If only a single node can be reached, $\alpha_h(i) = 1$. On the other hand, if all nodes have the same probability of being reached, $P_{ij} = 1/N$ for all pairs (i,j), and the entropy term equals $\ln(N)$, implying that $\alpha_h(i) = N$.
In order to avoid the path length h parameter, it is defined the $\textit{generalized random walk accessibility}$
$$ \begin{equation} \alpha(i) = \exp{\Bigg[ -\sum_{j=1}^N\textbf{P}_{ij} \ln{\big( \textbf{P}_{ij}\big)} \Bigg]} \end{equation} $$where $\textbf{P}$ is given by
$$ \begin{align} \textbf{P} &= \frac{1}{e} \sum_{k=0}^\infty \frac{1}{k!}P^k \\ &= e^P, \end{align} $$where $\textit{P}$ is the whole transit probability matrix and $e^P$ is set to be $\textit{matrix exponential}$. Here it is not considered any weight associated to each node, so every connection have the same probability to be followed.
def build_matrix_correl(G):
    """
    Calculate several centrality measurements of G to facilitate their
    comparison in a correlation matrix.

    PARAMETERS:
        G: networkx graph object
    RETURN:
        pandas DataFrame with columns
        {'K', 'CLC', 'B', 'EC', 'PR', 'KC', 'ACC'}, which stand for:
            K   - degree
            CLC - closeness centrality
            B   - betweenness centrality
            EC  - eigenvector centrality
            PR  - PageRank
            KC  - core number (k-core)
            ACC - random walk accessibility
    """
    # The nx functions already return dicts, so the dict() wrappers of
    # the original version were redundant.
    vk = list(dict(G.degree()).values())
    CLC = list(nx.closeness_centrality(G).values())
    B = list(nx.betweenness_centrality(G).values())
    EC = list(nx.eigenvector_centrality(G, max_iter=1000).values())
    PR = list(nx.pagerank(G, alpha=0.85).values())
    KC = list(nx.core_number(G).values())
    ACC = accessibility(G)
    df = pd.DataFrame({'K': vk, 'CLC': CLC, 'B': B, 'EC': EC,
                       'PR': PR, 'KC': KC, 'ACC': ACC})
    print(df.head())
    return df
def plot_matrix_correl(dataFrame):
    """
    Plot the correlation matrix for several quantities.

    PARAMETERS:
        dataFrame: pandas DataFrame, one column per measurement
    RETURN:
        None
    """
    corr = dataFrame.corr()
    plt.figure(figsize=(8, 8))
    plt.imshow(corr, cmap='Blues', interpolation='none', aspect='auto')
    plt.colorbar()
    plt.xticks(range(len(corr)), corr.columns, rotation='vertical',
               fontsize=13)
    plt.yticks(range(len(corr)), corr.columns, fontsize=13)
    plt.suptitle('Correlation between Centrality Measures', fontsize=15)
    plt.grid(False)
    # Fixed: plt.show(True) — `block` is keyword-only in pyplot.show,
    # so the positional True raises a TypeError on current matplotlib.
    plt.show()
With all these functions, we can plot the correlation matrix for all networks for all measures we want
One way to see the possibility of correlation between two measurements consists of drawing the network with node colors varying with some measure. Therefore, we can build a function to facilitate its usage for the whole correlation study.
def plot_network_measure_map(G, pos, dataFrame, measure_acronym, title):
    """
    Plot graph G with node colors mapped to one measurement, plus a
    colorbar on the figure lateral.

    PARAMETERS:
        G: networkx graph object
        pos: networkx position layout object
        dataFrame: pandas DataFrame holding one column per measurement
        measure_acronym: name of the DataFrame column whose values are
            mapped onto the node colors
        title: title of the graph plot
    RETURN:
        None
    """
    data = dataFrame[measure_acronym].tolist()
    # BUG FIX: the original read the global `dataFr` here instead of the
    # `dataFrame` argument, silently using whatever network was
    # processed last for the color scale.
    vmin = dataFrame[measure_acronym].min()
    vmax = dataFrame[measure_acronym].max()
    plt.figure(figsize=(15, 10))
    cmap = matplotlib.cm.get_cmap('cool')
    normalize = matplotlib.colors.Normalize(vmin=vmin, vmax=vmax)
    colors = [cmap(normalize(value)) for value in data]
    sm = plt.cm.ScalarMappable(cmap=cmap, norm=normalize)
    sm._A = []
    plt.colorbar(sm)
    nx.draw_networkx(G, node_size=20, labels={}, pos=pos,
                     node_color=colors)
    plt.suptitle(title, fontsize=20)
    plt.show()
# Draw the Euroroad giant component, then compute and plot the
# correlation matrix of its centrality measures.
plt.figure(figsize=(15,10))
nx.draw_networkx(G_euroroad, pos=pos_euro, node_size=20, labels={})
plt.show()  # fixed: plt.show(True) — `block` is keyword-only
dataFr = build_matrix_correl(G_euroroad)
plot_matrix_correl(dataFr)
As we can see by the correlation matrix, the highest correlations occur between K (degree) and PR (PageRank). Since PageRank is associated with how many times the i-th node is visited, it does make sense that the greater its degree, the bigger is the probability to a random walker arrive at it.
On the other hand, one of the lowest correlation occurs between CLC (Closeness Centrality) and PR (PageRank). Once Closeness Centrality indicates the average distance of shortest path between i-th node and all the others and for a road map there is no shortcuts between low-connected (low PR) and high-connected (high PR), so in order to get to a high PR j-th node it does not mean that the path from i to j is big (positively correlated) or small (negatively correlated). On a more realistic view, bigger cities are not necessarily at the center (low CLC) or at the borders (high CLC) of a state/country. Their geographic position is not correlated with their size, for instance.
# Map PageRank and Closeness Centrality onto the Euroroad network nodes.
plot_network_measure_map(G_euroroad, pos_euro, dataFr, 'PR', 'Euroroad map. PageRank variation')
plot_network_measure_map(G_euroroad, pos_euro, dataFr, 'CLC', 'Euroroad map. Closeness Centrality variation')
As we can see, there is no correlation between PageRank and Closeness Centrality. On the other hand, for degree and PageRank (below), the correlation can be seen more clearly.
# Degree map plus pairwise scatter plots of all Euroroad measures.
plot_network_measure_map(G_euroroad, pos_euro, dataFr, 'K', 'Euroroad map. Degree variation')
import seaborn as sns
sns.pairplot(dataFr)
plt.show()  # fixed: plt.show(True) — `block` is keyword-only
# Hamster friendships network: draw it, then compute and plot its
# correlation matrix.
plt.figure(figsize=(15,10))
nx.draw_networkx(G_hamster, pos=pos_hamst, node_size=10, labels={})
plt.show()  # fixed: plt.show(True) — `block` is keyword-only
dataFr = build_matrix_correl(G_hamster)
plot_matrix_correl(dataFr)
By this correlation matrix, once again PageRank (PR) and degree (K) presented the higher correlation (as Euroroad map). The same arguments still hold. Nevertheless, Betweenness Centrality (B) and K-core (KC) are one of the lowest correlated measurements on this network.
Betweenness Centrality for the i-th node can be understood as the total number of shortest paths between two nodes that pass through node i, and it does not correlate with k-core (thought of as how deeply a node is buried inside the network, or, in other words, how connected it is to high-degree neighbors) because this network is highly connected (it differs from Euroroad by the linear nature of its connections: friendship links are not limited to a '2D geographic plane'), which makes all nodes have similar B but not necessarily similar KC.
The comparison can be glimpsed by comparing the plotted networks below. Firstly we will compare PR and K, and, after, B with KC
# High-correlation pair (K vs PR), then low-correlation pair (B vs KC),
# followed by pairwise scatter plots.
plot_network_measure_map(G_hamster, pos_hamst, dataFr, 'K', 'Hamster\'s pet friends. Degree variation')
plot_network_measure_map(G_hamster, pos_hamst, dataFr, 'PR', 'Hamster\'s pet friends. PageRank variation')
plot_network_measure_map(G_hamster, pos_hamst, dataFr, 'B', 'Hamster\'s pet friends. Betweenness Centrality variation')
plot_network_measure_map(G_hamster, pos_hamst, dataFr, 'KC', 'Hamster\'s pet friends. K-core variation')
sns.pairplot(dataFr)
plt.show()  # fixed: plt.show(True) — `block` is keyword-only
# C. elegans neural network: draw it, then compute and plot its
# correlation matrix.
plt.figure(figsize=(15,10))
nx.draw_networkx(G_celegans, pos=pos_celegans, node_size=10, labels={})
plt.show()  # fixed: plt.show(True) — `block` is keyword-only
dataFr = build_matrix_correl(G_celegans)
plot_matrix_correl(dataFr)
Once again the highest correlation occurs on PageRank (PR) and Degree (K). The explanation holds as before.
The lowest correlation, as in the Hamster's petshop friends network, happens between Betweenness Centrality (B) and K-core (KC). The argument is still the same, i.e., the network is highly connected, with low variation in B, and presents a dense region with inner nodes that connect to highly connected neighbors.
As before, those conditions can be seen with the networks below. First we present the high correlated measurements and, after, the low correlated ones.
# High-correlation pair (K vs PR), then low-correlation pair (KC vs B),
# followed by pairwise scatter plots.
plot_network_measure_map(G_celegans, pos_celegans, dataFr, 'K', 'C. elegans neural network. Degree variation')
plot_network_measure_map(G_celegans, pos_celegans, dataFr, 'PR', 'C. elegans neural network. PageRank variation')
plot_network_measure_map(G_celegans, pos_celegans, dataFr, 'KC', 'C. elegans neural network. K-core variation')
plot_network_measure_map(G_celegans, pos_celegans, dataFr, 'B', 'C. elegans neural network. Betweenness Centrality variation')
sns.pairplot(dataFr)
plt.show()  # fixed: plt.show(True) — `block` is keyword-only
# US airport network: draw it, then compute and plot its correlation
# matrix.
plt.figure(figsize=(15,10))
nx.draw_networkx(G_usair, pos=pos_usair, node_size=10, labels={})
plt.show()  # fixed: plt.show(True) — `block` is keyword-only
dataFr = build_matrix_correl(G_usair)
plot_matrix_correl(dataFr)
Finally, for the US airport network, we got that the highest correlation happens between Degree (K) and Eigenvector Centrality (EC). Once EC reflects the importance of a node, which turns out to be the ones that have the most connections, it makes sense that on this network, K and EC have the highest correlation. Since airports are not confined on a 2D plane (like road map), small airports do not need to make several connections until link to bigger ones (which, not coincidently, have huge amount of links), increasing their (the big ones) importance.
One of the lowest correlations happens when comparing K-core (KC) and Betweenness Centrality (B). The argument is the same as for C. elegans and Hamster's petshop networks (i.e. the network is still higly connected, with low variation on B and presents a dense region with inner nodes that connects with highly connected neighbors).
Also, those conditions can be seen with the networks below. First we present the high correlated measurements and, after, the low correlated ones.
# High-correlation pair (K vs EC), then low-correlation pair (B vs KC),
# followed by pairwise scatter plots.
plot_network_measure_map(G_usair, pos_usair, dataFr, 'K', 'US airport network. Degree variation')
plot_network_measure_map(G_usair, pos_usair, dataFr, 'EC', 'US airport network. Eigenvector Centrality variation')
plot_network_measure_map(G_usair, pos_usair, dataFr, 'B', 'US airport network. Betweenness Centrality variation')
plot_network_measure_map(G_usair, pos_usair, dataFr, 'KC', 'US airport network. K-core variation')
sns.pairplot(dataFr)
plt.show()  # fixed: plt.show(True) — `block` is keyword-only
On this question, we must first build a function to handle .csv files, since that is the format of the city networks.
def read_csv_and_Gcc(path, source='u', target='v', nodetype=int):
    """
    Read a CSV edge-list file into a networkx graph.

    PARAMETERS:
        path: file path as string located in computer
        source: name of the CSV column holding the edge source nodes
        target: name of the CSV column holding the edge target nodes
        nodetype: kept for interface compatibility (node labels are
            relabelled to consecutive integers regardless)
    RETURN:
        tuple (pos, G): the position object (networkx spring_layout,
        for drawing purposes) and the largest connected component of
        the undirected, self-loop-free graph built from the file.
    """
    # pandas is already imported at module level as pd; the local
    # import and the no-op df.head() of the original were dropped.
    df = pd.read_csv(path)
    G = nx.from_pandas_edgelist(df, source=source, target=target)
    G = nx.convert_node_labels_to_integers(G, first_label=0)
    pos = nx.spring_layout(G)
    G = G.to_undirected()
    # Graph.selfloop_edges() and nx.connected_component_subgraphs()
    # were removed in networkx 2.4; these are the supported replacements.
    G.remove_edges_from(nx.selfloop_edges(G))
    largest_cc = max(nx.connected_components(G), key=len)
    G = G.subgraph(largest_cc).copy()
    return pos, G
Now, we can use this function to store each network
# Load the three street networks (two Florida counties and Honolulu).
pos_washing, G_washing = read_csv_and_Gcc('/home/jorge/Downloads/12-FL-counties-street_networks-node_edge_lists/12133_Washington_County/edge_list.csv')
pos_stJohn, G_stJohn = read_csv_and_Gcc('/home/jorge/Downloads/12-FL-counties-street_networks-node_edge_lists/12109_St._Johns_County/edge_list.csv')
pos_hawaii, G_hawaiiStreet = read_csv_and_Gcc('/home/jorge/Downloads/15-HI-urbanized_areas-street_networks-node_edge_lists/89770_Urban_Honolulu_HI_Urbanized_Area/edge_list.csv')
Once those are huge networks, instead of using the already-built functions (time spent would be big), we are going to use only those network metrics that are asked for
K = [] #Stores Degree
B = [] #Stores Betweenness Centrality (original comment wrongly said Eigenvector Centrality)
CLC = [] #Stores Closenness Centrality
# Compute the three metrics for each of the three street networks.
for i in range(3):
    print(i)  # progress marker
    if i == 0:
        G_i = G_washing
    elif i == 1:
        G_i = G_stJohn
    else:
        G_i = G_hawaiiStreet
    vk = dict(G_i.degree())
    vk = list(vk.values())
    K.append(vk)
    clc = dict(nx.closeness_centrality(G_i))
    clc = list(clc.values())
    CLC.append(clc)
    b = dict(nx.betweenness_centrality(G_i))
    b = list(b.values())
    B.append(b)
    print('Done!')  # one marker per network
Now we can make the histograms of those metrics and compare each of them.
# Three side-by-side normalized histograms comparing the cities:
# degree, closeness centrality and betweenness centrality.
plt.figure(1, figsize=(15,5))
plt.subplot(1,3,1)
plt.hist(K[0], bins=20, density=True, label="Washington", histtype='step', linewidth=1.4)
plt.hist(K[1], bins=20, density=True, label="St. John", histtype='step', linewidth=1.4)
plt.hist(K[2], bins=20, density=True, label="Hawaii", histtype='step', linewidth=1.4)
plt.legend();
plt.subplot(1,3,2)
plt.hist(CLC[0], bins=50, density=True, label="Washington", histtype='step', linewidth=1.4)
plt.hist(CLC[1], bins=50, density=True, label="St. John", histtype='step', linewidth=1.4)
plt.hist(CLC[2], bins=50, density=True, label="Hawaii", histtype='step', linewidth=1.4)
plt.legend();
plt.subplot(1,3,3)
plt.hist(B[0], bins=50, density=True, label="Washington", histtype='step', linewidth=1.4)
plt.hist(B[1], bins=50, density=True, label="St. John", histtype='step', linewidth=1.4)
plt.hist(B[2], bins=50, density=True, label="Hawaii", histtype='step', linewidth=1.4)
# Betweenness is concentrated near zero, so the x axis is clipped.
plt.xlim(0,0.15)
plt.legend();
Since we cannot analyze the last plot (betweenness centrality), we will make the same set of graphics with a logarithmic scale for it.
# Same three histograms, now titled, with the betweenness panel drawn
# on a logarithmic y scale so its tail is visible.
plt.figure(1, figsize=(15,5))
plt.subplot(1,3,1)
plt.title('Degree distribution', fontsize=15)
plt.hist(K[0], bins=20, density=True, label="Washington", histtype='step', linewidth=1.4)
plt.hist(K[1], bins=20, density=True, label="St. John", histtype='step', linewidth=1.4)
plt.hist(K[2], bins=20, density=True, label="Hawaii", histtype='step', linewidth=1.4)
plt.legend();
plt.subplot(1,3,2)
plt.title('Closenness Centrality distribution', fontsize=15)
plt.hist(CLC[0], bins=50, density=True, label="Washington", histtype='step', linewidth=1.4)
plt.hist(CLC[1], bins=50, density=True, label="St. John", histtype='step', linewidth=1.4)
plt.hist(CLC[2], bins=50, density=True, label="Hawaii", histtype='step', linewidth=1.4)
plt.legend();
plt.subplot(1,3,3)
plt.title('Betweenness Centrality distribution', fontsize=15)
plt.hist(B[0], bins=50, density=True, label="Washington", histtype='step', linewidth=1.4, log=True)
plt.hist(B[1], bins=50, density=True, label="St. John", histtype='step', linewidth=1.4, log=True)
plt.hist(B[2], bins=50, density=True, label="Hawaii", histtype='step', linewidth=1.4, log=True)
plt.xlim(0,0.15)
plt.legend();
As we can see, all cities preserve nearly the same distribution for degree, which makes sense, since road maps are confined to 2D geographic plane, which makes almost impossible for a node to connect with more than 5 or 6 others. The real distinction becomes clear for Closenness Centrality (CLC) and Betweenness Centrality (BC), even though they do not present much difference.
Once CLC is related to the shortest paths between two nodes and BC, to the number of shortest paths that passes through each node, the central plot shows that St. John's County has more numerous nodes that are closer to each other, making this city faster to navigate (go from point a to b takes, on average, shorter steps than on Washington city). However, the last plot shows that Washington, despite taking the longest runs between two points, is easier to navigate, since the small slant relates to the presence of more nodes with higher BC values, i.e., there are more nodes with more shortest paths going through them, making it easier to find alternatives if we were suddenly lost.
Those metrics can also be visualized by mapping them node-by-node, in a representation similar to Question 1. For this purpose, we can modify the plot_network_measure_map() function as below.
def plot_multiple_network_measure_map(n, G, pos, dataFrame, measure_acronym, big_title=None, small_title=None):
    """
    Plot up to four color-mapped views of graph G, one subplot per
    measurement, each with its own colorbar.

    PARAMETERS:
        n: number of metrics to plot (at most 4: a 2x2 subplot grid)
        G: networkx graph object
        pos: networkx position layout object
        dataFrame: pandas DataFrame holding one column per measurement
        measure_acronym: list of DataFrame column names, one per subplot
        big_title: network name (figure suptitle)
        small_title: list of subplot titles
    RETURN:
        None
    """
    plt.figure(1, figsize=(15, 15))
    plt.suptitle(big_title, fontsize=20)
    for i in range(n):
        acronym = measure_acronym[i]
        data = dataFrame[acronym].tolist()
        # BUG FIX: the original read the global `dataFr` here instead of
        # the `dataFrame` argument passed by the caller.
        vmin = dataFrame[acronym].min()
        vmax = dataFrame[acronym].max()
        plt.subplot(2, 2, i + 1)
        cmap = matplotlib.cm.get_cmap('cool')
        normalize = matplotlib.colors.Normalize(vmin=vmin, vmax=vmax)
        colors = [cmap(normalize(value)) for value in data]
        sm = plt.cm.ScalarMappable(cmap=cmap, norm=normalize)
        sm._A = []
        plt.colorbar(sm)
        nx.draw_networkx(G, node_size=2, with_labels=False, pos=pos,
                         node_color=colors)
        plt.title(small_title[i], fontsize=15)
    plt.show()
# One multi-panel measure map per city: degree, closeness and
# betweenness side by side.
dataFr = pd.DataFrame({'K':K[0], 'CLC':CLC[0], 'B':B[0]})
title = ('Degree variation', 'Closenness Centrality variation', 'Betweenness Centrality variation')
acronyms = ('K', 'CLC', 'B')
big_title = 'Washington city'
plot_multiple_network_measure_map(3, G_washing, pos_washing, dataFr, measure_acronym=acronyms, big_title=big_title, small_title=title)
dataFr = pd.DataFrame({'K':K[1], 'CLC':CLC[1], 'B':B[1]})
title = ('Degree variation', 'Closenness Centrality variation', 'Betweenness Centrality variation')
acronyms = ('K', 'CLC', 'B')
big_title = 'St. John\'s County'
plot_multiple_network_measure_map(3, G_stJohn, pos_stJohn, dataFr, measure_acronym=acronyms, big_title=big_title, small_title=title)
dataFr = pd.DataFrame({'K':K[2], 'CLC':CLC[2], 'B':B[2]})
title = ('Degree variation', 'Closenness Centrality variation', 'Betweenness Centrality variation')
acronyms = ('K', 'CLC', 'B')
big_title = 'Hawaii urban area'
plot_multiple_network_measure_map(3, G_hawaiiStreet, pos_hawaii, dataFr, measure_acronym=acronyms, big_title=big_title, small_title=title)
Unfortunately, those networks are densely populated with nodes, with not much varying measurements. Therefore those gradient representations turns out to be not as much efficient to data visualization as before.
Firstly, we must store each network in order to work with them
# Load the five networks used in this question (protein interactions,
# two C. elegans interactome snapshots, power grid, R collaborations).
pos_protein, G_protein = read_txt_and_Gcc('/home/jorge/Documents/redes_complexas/data/ppi.maayan-vidal.txt')
pos_celegans4, G_celegans4 = read_txt_and_Gcc('/home/jorge/Documents/redes_complexas/wi2004.txt', comment='#', nodetype=str)
pos_celegans7, G_celegans7 = read_txt_and_Gcc('/home/jorge/Documents/redes_complexas/wi2007.txt', comment='#', nodetype=str)
pos_power, G_power = read_txt_and_Gcc('/home/jorge/Documents/redes_complexas/data/powergrid.txt')
pos_Rcolab, G_Rcolab = read_csv_and_Gcc('/home/jorge/Documents/redes_complexas/dependencies.csv', source='from', target='to')
K = [] #Stores Degree
EC = [] #Stores Eigenvector Centrality
PR = [] #Stores PageRank
CLC = [] #Stores Closenness Centrality
# Compute the four metrics for each of the five networks.
for i in range(5):
    if i == 0:
        G_i = G_protein
    elif i == 1:
        G_i = G_celegans4
    elif i == 2:
        G_i = G_celegans7
    elif i == 3:
        G_i = G_power
    else:
        G_i = G_Rcolab
    vk = dict(G_i.degree())
    vk = list(vk.values())
    K.append(vk)
    clc = dict(nx.closeness_centrality(G_i))
    clc = list(clc.values())
    CLC.append(clc)
    ec = dict(nx.eigenvector_centrality(G_i, max_iter=1000))
    ec = list(ec.values())
    EC.append(ec)
    pr = dict(nx.pagerank(G_i, alpha=0.85))
    pr = list(pr.values())
    PR.append(pr)
# 2x2 grid of normalized histograms comparing the five networks:
# degree, eigenvector centrality, PageRank and closeness centrality.
plt.figure(1, figsize=(15,15))
plt.subplot(3,2,1)
plt.title('Degree')
plt.hist(K[0], bins=20, density=True, label="Protein", histtype='step', linewidth=1.4)
plt.hist(K[1], bins=20, density=True, label="C. elegans 2004", histtype='step', linewidth=1.4)
plt.hist(K[2], bins=20, density=True, label="C. elegans 2007", histtype='step', linewidth=1.4)
plt.hist(K[3], bins=20, density=True, label="Power grid", histtype='step', linewidth=1.4)
plt.hist(K[4], bins=20, density=True, label="R colaborators", histtype='step', linewidth=1.4)
plt.xlim(0,60)
plt.legend();
plt.subplot(3,2,2)
plt.title('Eigenvector Centrality')
plt.hist(EC[0], bins=20, density=True, label="Protein", histtype='step', linewidth=1.4)
plt.hist(EC[1], bins=20, density=True, label="C. elegans 2004", histtype='step', linewidth=1.4)
plt.hist(EC[2], bins=20, density=True, label="C. elegans 2007", histtype='step', linewidth=1.4)
plt.hist(EC[3], bins=20, density=True, label="Power grid", histtype='step', linewidth=1.4)
plt.hist(EC[4], bins=20, density=True, label="R colaborators", histtype='step', linewidth=1.4)
plt.xlim(0,0.2)
plt.legend();
plt.subplot(3,2,3)
plt.title('PageRank')
plt.hist(PR[0], bins=20, density=True, label="Protein", histtype='step', linewidth=1.4)
plt.hist(PR[1], bins=20, density=True, label="C. elegans 2004", histtype='step', linewidth=1.4)
plt.hist(PR[2], bins=20, density=True, label="C. elegans 2007", histtype='step', linewidth=1.4)
plt.hist(PR[3], bins=20, density=True, label="Power grid", histtype='step', linewidth=1.4)
plt.hist(PR[4], bins=20, density=True, label="R colaborators", histtype='step', linewidth=1.4)
plt.xlim(0,0.01)
plt.legend();
plt.subplot(3,2,4)
plt.title('Closenness Centrality')
plt.hist(CLC[0], bins=20, density=True, label="Protein", histtype='step', linewidth=1.4)
plt.hist(CLC[1], bins=20, density=True, label="C. elegans 2004", histtype='step', linewidth=1.4)
plt.hist(CLC[2], bins=20, density=True, label="C. elegans 2007", histtype='step', linewidth=1.4)
plt.hist(CLC[3], bins=20, density=True, label="Power grid", histtype='step', linewidth=1.4)
plt.hist(CLC[4], bins=20, density=True, label="R colaborators", histtype='step', linewidth=1.4)
plt.legend();
It is possible to note that Degree, Eigenvector Centrality and PageRank distributions have a typical negative exponential distribution, except for R colaborators networks, which has a constant-like distributed degree. This also affects the Eigenvector Centrality (nodes have the lowest importance variation) and PageRank (more nodes are visited with the same frequency).
All the other networks (from Protein to Power grid) have the properties that almost all nodes have few connections (degree), also there are only a few nodes that have an expressive bigger importance than the others (eigenvector centrality) and a few nodes are visited expressively more frequently that the others.
The only metric markedly distinct from the others is the Closeness Centrality, which shows up to be approximately normally distributed for all networks. Power grid has its peak at the lowest value and R collaborators at the highest one. Since this measurement is related to the average distance between two nodes, it does make sense that the distribution presents a well-defined mean (low standard deviation), because all nodes must be reachable in a small time span (there are no isolated nodes). This is more visible for the power grid (lowest standard deviation), since a single problem on the network can be reached in nearly the same time anywhere.
In order to better discuss those results, we can plot the logarithm of these graphics (K, EC and PR will be approximatelly linear).
# Same 2x2 grid, now with log y scale on K, EC and PR (their
# distributions are roughly exponential); CLC stays on a linear scale.
plt.figure(1, figsize=(15,15))
plt.subplot(3,2,1)
plt.title('Degree')
plt.hist(K[0], bins=20, density=True, label="Protein", histtype='step', linewidth=1.4, log=True)
plt.hist(K[1], bins=20, density=True, label="C. elegans 2004", histtype='step', linewidth=1.4, log=True)
plt.hist(K[2], bins=20, density=True, label="C. elegans 2007", histtype='step', linewidth=1.4, log=True)
plt.hist(K[3], bins=20, density=True, label="Power grid", histtype='step', linewidth=1.4, log=True)
plt.hist(K[4], bins=20, density=True, label="R colaborators", histtype='step', linewidth=1.4, log=True)
plt.legend();
plt.subplot(3,2,2)
plt.title('Eigenvector Centrality')
plt.hist(EC[0], bins=20, density=True, label="Protein", histtype='step', linewidth=1.4, log=True)
plt.hist(EC[1], bins=20, density=True, label="C. elegans 2004", histtype='step', linewidth=1.4, log=True)
plt.hist(EC[2], bins=20, density=True, label="C. elegans 2007", histtype='step', linewidth=1.4, log=True)
plt.hist(EC[3], bins=20, density=True, label="Power grid", histtype='step', linewidth=1.4, log=True)
plt.hist(EC[4], bins=20, density=True, label="R colaborators", histtype='step', linewidth=1.4, log=True)
plt.legend();
plt.subplot(3,2,3)
plt.title('PageRank')
plt.hist(PR[0], bins=20, density=True, label="Protein", histtype='step', linewidth=1.4, log=True)
plt.hist(PR[1], bins=20, density=True, label="C. elegans 2004", histtype='step', linewidth=1.4, log=True)
plt.hist(PR[2], bins=20, density=True, label="C. elegans 2007", histtype='step', linewidth=1.4, log=True)
plt.hist(PR[3], bins=20, density=True, label="Power grid", histtype='step', linewidth=1.4, log=True)
plt.hist(PR[4], bins=20, density=True, label="R colaborators", histtype='step', linewidth=1.4, log=True)
plt.legend();
plt.subplot(3,2,4)
plt.title('Closenness Centrality')
plt.hist(CLC[0], bins=20, density=True, label="Protein", histtype='step', linewidth=1.4)
plt.hist(CLC[1], bins=20, density=True, label="C. elegans 2004", histtype='step', linewidth=1.4)
plt.hist(CLC[2], bins=20, density=True, label="C. elegans 2007", histtype='step', linewidth=1.4)
plt.hist(CLC[3], bins=20, density=True, label="Power grid", histtype='step', linewidth=1.4)
plt.hist(CLC[4], bins=20, density=True, label="R colaborators", histtype='step', linewidth=1.4)
plt.legend();
Without doing linear regressions (all graphs are already full of 'noisy' information), we can discuss them qualitatively. Especially in terms of Eigenvector centrality and PageRank. For EC, Power grid, R colaborator and protein/C. elegans networks happens to have the highest to the lowest slant (respectively) and for PR, the order almost remains the same since we get power grid with the highest slant and the others, nearly the same.
This did not give a good visualization. Therefore, we can also use the plot_multiple_network_measure_map() function.
def plot_multiple_network_measure_map(n, G, pos, dataFrame, measure_acronym, big_title=None, small_title=None, node_size=5):
    """
    FUNCTION:
        Draw n colour-coded views of graph G, one per network measurement: each node
        is coloured by its value of that measurement and a colorbar is added beside
        each subplot.
    PARAMETERS:
        n: integer, number of metrics to draw (one subplot each, 2x2 grid)
        G: networkx graph object
        pos: networkx position layout object
        dataFrame: pandas DataFrame holding one measurement column per node
        measure_acronym: list of column names in dataFrame, one per subplot
        big_title: network name used as the figure title
        small_title: list of subplot titles
        node_size: node size forwarded to nx.draw_networkx
    RETURN:
        No return value
    """
    plt.figure(1, figsize=(15, 15))
    plt.suptitle(big_title, fontsize=20)
    for i in range(n):
        acronym = measure_acronym[i]
        data = list(dataFrame[acronym])
        # BUG FIX: vmin/vmax previously read the *global* variable 'dataFr' instead of
        # the 'dataFrame' argument, so the colour scale silently depended on whatever
        # DataFrame happened to be in the caller's scope.
        vmin = dataFrame[acronym].min()
        vmax = dataFrame[acronym].max()
        plt.subplot(2, 2, i+1)
        cmap = matplotlib.cm.get_cmap('cool')
        normalize = matplotlib.colors.Normalize(vmin=vmin, vmax=vmax)
        colors = [cmap(normalize(value)) for value in data]
        # ScalarMappable with an empty data array: only used to feed the colorbar.
        sm = plt.cm.ScalarMappable(cmap=cmap, norm=plt.Normalize(vmin=vmin, vmax=vmax))
        sm._A = []
        plt.colorbar(sm)
        nx.draw_networkx(G, node_size=node_size, with_labels=False, pos=pos,
                         node_color=colors)
        plt.title(small_title[i], fontsize=15)
    plt.show(True)
# Measurement maps for the Human proteome network: degree, eigenvector centrality,
# PageRank and closeness centrality coloured over the graph layout.
# NOTE(review): as originally written, plot_multiple_network_measure_map reads the
# global name 'dataFr' for its colour range, so this variable must keep that name.
dataFr = pd.DataFrame({'K':K[0], 'EC':EC[0], 'PR':PR[0], 'CLC':CLC[0]})
title = ('Degree variation', 'Eigenvector Centrality variation', 'PageRank variation', 'Closenness Centrality variation')
acronyms = ('K', 'EC', 'PR', 'CLC')
big_title = 'Human proteome (protein-protein interaction)'
plot_multiple_network_measure_map(4, G_protein, pos_protein, dataFr, measure_acronym=acronyms, big_title=big_title, small_title=title)
Now we can see more clearly that the human proteome presents only a few (barely visible) highly connected nodes. Even though we cannot see them in the top-left figure, since the sidebar shows a top degree of 120 and it is built to go up to the highest value in the network, we can assert that there is at least one highly connected node. Most of them lie in the degree span of 1 to 20. Eigenvector centrality follows the same pattern; however for PageRank, despite having the same exponential distribution, it presents a low variation (between 0 and ~10$^{-2}$), and we can conclude that there is no preferential node (the highest-connected ones are so diluted that they do not represent a preferential 'travel point'). This can also be seen in the Betweenness Centrality plot. The recurrent low degree makes all shortest paths pass through the majority of nodes, making their closeness nearly the same.
Those mentioned properties can be understandable for human proteome. If proteins had to have a intrincated dependancy on each other (PageRank wide-variable, keeping the power law and/or Closenness Centrality also a power law), random deleterious mutations would destroy all the network. This can be compare with $\textbf{Complexity}$ metric.
Complexity measurement is defined as
$$ \begin{equation} \alpha = \frac{\langle{k^2}\rangle}{\langle{k}\rangle} \end{equation} $$where k is the degree distribution. Therefore, the complexity is the ratio between second and first moment of a distribution. It is related to the robustness/fragility of a network, once it is associated with deviation from mean value. With this quantity, we can also define $f_{c}$ as
$$ \begin{equation} f_c = 1 - \frac{1}{\alpha - 1} \end{equation} $$standing for the fraction of nodes needed to be removed from the network to break it appart
def degree_distribution(G):
    """
    FUNCTION:
        Calculates the degree distribution of graph G: the probability P(k) of a node
        having degree k, for every k from 0 up to the maximum degree found.
    PARAMETERS:
        G: networkx graph object
    RETURN:
        tuple (kvalues, pk): kvalues: array of degree values 0..max degree (histogram bins);
                             pk: probability associated with each degree value
    """
    vk = list(dict(G.degree()).values())
    maxk = np.max(vk)
    # BUG FIX: the original line 'mink = np.min(min)' applied np.min to the *builtin*
    # 'min' function instead of the degree list; the value was never used, so the
    # line was removed altogether.
    kvalues = np.arange(0, maxk + 1)
    pk = np.zeros(maxk + 1)
    for k in vk:
        pk[k] += 1
    pk = pk / sum(pk)
    return kvalues, pk
def moment_degree(G, m):
    """
    FUNCTION:
        Computes the m-th raw moment <k^m> of the degree distribution of graph G.
    PARAMETERS:
        G: networkx graph object
        m: order of the desired moment
    RETURN:
        float M: m-th raw moment of the degree distribution
    """
    degrees, probabilities = degree_distribution(G)
    # <k^m> = sum_k k^m * P(k)
    return sum((degrees ** m) * probabilities)
def complexity(G_i):
    """
    FUNCTION:
        Calculates the complexity metric alpha = <k^2>/<k> of a network and the
        breakdown node fraction fc = 1 - 1/(alpha - 1) (Molloy-Reed criterion),
        i.e. the fraction of nodes that must be removed for the giant component
        to fall apart.
        NOTE: fc is negative whenever alpha < 2, meaning the network is already
        below the percolation threshold -- a property of the formula, not a
        numerical error.
    PARAMETERS:
        G_i: networkx graph object
    RETURN:
        tuple (complx, fc): complx: complexity measurement
                            fc: breakdown node fraction
    """
    degrees = list(dict(G_i.degree()).values())
    # First raw moment <k>; np.mean replaces the original manual accumulation loop.
    mean = np.mean(degrees)
    # Second *raw* moment <k^2> (the original named this 'var', which was misleading).
    second_moment = moment_degree(G_i, 2)
    complx = second_moment / mean
    fc = 1 - 1 / (complx - 1)
    return complx, fc
# Summary table: network size N, complexity alpha = <k^2>/<k> and Molloy-Reed
# breakdown fraction fc = 1 - 1/(alpha - 1) for every network under study.
print('Network\t\tN\tComplexity\tFraction to breakdown')
smp = complexity(G_protein)
smc4 = complexity(G_celegans4)
smc7 = complexity(G_celegans7)
smpw = complexity(G_power)
smr = complexity(G_Rcolab)
print('Proteome\t%d\t%.3f\t\t%.3f'%(G_protein.number_of_nodes(), smp[0], smp[1]))
print('C. elegans 04\t%d\t%.3f\t\t%.3f'%(G_celegans4.number_of_nodes(), smc4[0], smc4[1]))
print('C. elegans 07\t%d\t%.3f\t\t%.3f'%(G_celegans7.number_of_nodes(), smc7[0], smc7[1]))
print('Power grid\t%d\t%.3f\t\t%.3f'%(G_power.number_of_nodes(), smpw[0], smpw[1]))
print('R colaborators\t%d\t%.3f\t\t%.3f'%(G_Rcolab.number_of_nodes(), smr[0], smr[1]))
As we can see, biological networks (human proteome and C. elegans neural network) are more robust than the power grid network and less robust than interpersonal relations (R colaborators). One way to explain these relations is the dependency on a central supply in the power grid, which does not occur in biological networks, precisely to avoid random failures leading to system shutdowns. For interpersonal relations, it is possible to maintain spatially very-long-range connections, resulting in an even more robust network. (Note: the fraction to breakdown $f_c = 1 - 1/(\alpha - 1)$ is negative whenever $\alpha < 2$, i.e. when the network is already below the Molloy-Reed percolation threshold, so a negative value is a property of the formula rather than an error in the calculation.)
# Same four measurement maps, now for the C. elegans 2004 neural network.
dataFr = pd.DataFrame({'K':K[1], 'EC':EC[1], 'PR':PR[1], 'CLC':CLC[1]})
title = ('Degree variation', 'Eigenvector Centrality variation', 'PageRank variation', 'Closenness Centrality variation')
acronyms = ('K', 'EC', 'PR', 'CLC')
big_title = 'C. elegans neural network 2004'
plot_multiple_network_measure_map(4, G_celegans4, pos_celegans4, dataFr, measure_acronym=acronyms, big_title=big_title, small_title=title)
Similarly to human proteome network, C. elegans neural network happens to have more visible nodes with high degree and PageRank varying in one magnitude order (from ~10$^{-3}$ to ~10$^{-2}$), presenting more relavant nodes than proteome's, although it shows a widespread closenness centrality, as proteome network. These differences make it more knotty than proteome, therefore more likely to be vulnerable to random failures (as we can see from the slightly decrease on complexity and on fraction of nodes to breakdown).
# Same four measurement maps, for the C. elegans 2007 neural network.
dataFr = pd.DataFrame({'K':K[2], 'EC':EC[2], 'PR':PR[2], 'CLC':CLC[2]})
title = ('Degree variation', 'Eigenvector Centrality variation', 'PageRank variation', 'Closenness Centrality variation')
acronyms = ('K', 'EC', 'PR', 'CLC')
big_title = 'C. elegans neural network 2007'
plot_multiple_network_measure_map(4, G_celegans7, pos_celegans7, dataFr, measure_acronym=acronyms, big_title=big_title, small_title=title)
The new-made neural newtork for C. elegans (3 years after the one showed before (2004 and 2007)) presents a small increase on degree and PageRank span, and a small decrease on Eigenvector Centrality and Closenness centrality. It does make sense, since the degree increase is associated with adding highly connected nodes (or increasing the connections of those already presented) which, in turn, increments their importance (PageRank), while reduce the average shortests paths (closenness) and reduce their important linkages (eigenvetor). Also, these variations leads to a (even though slightly, but still visible) decrease in complexity (and on the node fraction for brakdowns).
# Same four measurement maps, for the US power grid network.
dataFr = pd.DataFrame({'K':K[3], 'EC':EC[3], 'PR':PR[3], 'CLC':CLC[3]})
title = ('Degree variation', 'Eigenvector Centrality variation', 'PageRank variation', 'Closenness Centrality variation')
acronyms = ('K', 'EC', 'PR', 'CLC')
big_title = 'Power grid network'
plot_multiple_network_measure_map(4, G_power, pos_power, dataFr, measure_acronym=acronyms, big_title=big_title, small_title=title)
Power grid network presents one of the most variations (along with R collaborators). It has a huge amount of nodes (the highest value between all studied networks) and also the most visible (comparing with biological networks) negative correlation between Eigenvector Centrality and Closenness Centrality. It is probably related to the presence of central power supplies, that does not make short paths between two nodes (it does connect with other important nodes (eigenvector), however avoids to be along short paths between two random ones (closenness)). Because of that, its robustness is cracked, decreasing in one magnitude order (the lowest complexity value among all other networks).
# Same four measurement maps, for the R collaborators network.
dataFr = pd.DataFrame({'K':K[4], 'EC':EC[4], 'PR':PR[4], 'CLC':CLC[4]})
title = ('Degree variation', 'Eigenvector Centrality variation', 'PageRank variation', 'Closenness Centrality variation')
acronyms = ('K', 'EC', 'PR', 'CLC')
big_title = 'R colaborators network'
plot_multiple_network_measure_map(4, G_Rcolab, pos_Rcolab, dataFr, measure_acronym=acronyms, big_title=big_title, small_title=title)
Finally, for R collaborators newtork, we can see the same properties as before, however node degree skyrockets to a maximum value of nearly 600, which is made possible mostly by geographicly long-range interactions (it is not limited by 2D nor 3D spaces, like the other ones). It also presents the highest values for Closenness centrality, PageRank and Eigenvector centrality, making them clearly visible on above representations as a few (if not one) darkish nodes.
It is worth noting the complexity and the fractions to breakdowns' values. This network has the most widespread distributions for all measurements, specially the degree distribution. It makes the variance (i.e. $\langle{k^2}\rangle$) bigger, increasing its complexity (therefore its robustness), which also increases the number of nodes needed to be removed to breakdowns (also the highest value). The long range interactions turns the network more difficult for breaking appart. It also spreads out the other measurements, creating shortcuts (closenness), instensifying the importance of few nodes (PageRank) and increasing the linkage between those few relevant nodes (eigenvector).
For this question, we must consider the same networks from the previous question, that is Human proteome, C. elegans 2004 and C. elegans 2007 neural networks, US power grid and R collaboration.
Making usage of Principal Component Analysis (PCA), we aim to discuss the similarities of the networks.
Firstly we need to calculate the distribution for each metric. This can be done with the already built function build_matrix_correl(). After that, for each measurement (resulting in a distribution over each node), we can calculate the distribution properties. Namely Average (AVG), Standard Deviation (STD), Second Moment (SM), Shannon Entropy (SHE). Therefore, a function that returns those measurements turns out to be worth to be built
def distribution_metrics(dataFrame):
    """
    FUNCTION:
        Computes, for every column of a measurement DataFrame, four summary statistics
        of the corresponding distribution: mean (AVG), standard deviation (STD),
        second central moment (SM) and Shannon entropy (SHE).
    PARAMETERS:
        dataFrame: pandas DataFrame object
                   ex:
                       dataFrame = {'EC':EC, 'BC':BC}
                   where 'EC' stands for Eigenvector Centrality with type(EC) = list;
                   'BC' stands for Betweenness Centrality with type(BC) = list
    RETURN:
        list all_metrics: one [AVG, STD, SM, SHE] list per column of dataFrame
    """
    return [
        [
            np.mean(dataFrame[column]),
            np.std(dataFrame[column]),
            stats.moment(dataFrame[column], moment=2),
            stats.entropy(dataFrame[column]),
        ]
        for column in dataFrame
    ]
Now we can calculate those metrics for each dataFrame (set of measurements built from build_matrix_correl() function) for each network and store them on 'all_metrics' vector.
It is worth mentioned that 'all_metrics' vector is a layered with 3 levels:
0 - corresponds to each network (namely: Human proteome, C. elegans 2004, C. elegans 2007, Power grid, R collaborators);
1 - corresponds to each set of network measurements (namely: K, CLC, B, EC, PR, KC, ACC)
2 - corresponds to each statistical metric (namely: AVG, STD, SM, SHE)
# Build the measurement DataFrame for each network (build_matrix_correl is defined
# earlier in the file) and store the four summary statistics per measurement.
# Layout: all_metrics[i][j][s] = network i, measurement j, statistic s (AVG, STD, SM, SHE).
all_metrics = []
dataFr = build_matrix_correl(G_protein)
all_metrics.append(distribution_metrics(dataFr))
dataFr = build_matrix_correl(G_celegans4)
all_metrics.append(distribution_metrics(dataFr))
dataFr = build_matrix_correl(G_celegans7)
all_metrics.append(distribution_metrics(dataFr))
dataFr = build_matrix_correl(G_power)
all_metrics.append(distribution_metrics(dataFr))
dataFr = build_matrix_correl(G_Rcolab)
all_metrics.append(distribution_metrics(dataFr))
Now, we can take the Human proteome as a network example. It has the first set of the four statistical metrics for each one of the seven network measurement. Therefore we can express it as the table below
# Print the statistics table for the Human proteome network
# (7 measurements, 4 statistics each).
print('\tAVG\t\t\tSTD\t\tSM\t\t\tSHE')
for i in range(7):
    print(all_metrics[0][i])
Now we can decompose this 4x7 dimension vector into a 3D representation
from sklearn.decomposition import PCA
# BUG FIX: the original used PCA(n_components=1) and then transposed each result,
# so the 3D scatter below actually plotted a single point per network (the first
# principal component of the first three measurements). Projecting onto 3
# components yields one genuine 3D point per network measurement, as intended.
pca = PCA(n_components=3)
x = pca.fit_transform(all_metrics[0])   # Human proteome: 7 measurements x 3 components
print(x)
y = pca.fit_transform(all_metrics[1])   # C. elegans 2004
z = pca.fit_transform(all_metrics[2])   # C. elegans 2007
a = pca.fit_transform(all_metrics[3])   # Power grid
b = pca.fit_transform(all_metrics[4])   # R collaborators
from mpl_toolkits.mplot3d import Axes3D
fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(111, projection='3d')
plt.title('Principal Component Analysis (PCA) 3D', fontsize=15)
ax.scatter(x[:, 0], x[:, 1], x[:, 2], label='Human proteome')
ax.scatter(y[:, 0], y[:, 1], y[:, 2], label='C. elegans 2004')
ax.scatter(z[:, 0], z[:, 1], z[:, 2], label='C. elegans 2007')
ax.scatter(a[:, 0], a[:, 1], a[:, 2], label='Power grid')
ax.scatter(b[:, 0], b[:, 1], b[:, 2], label='R collaborators')
ax.legend()
plt.show()
Also, for a better representation, we can reduce it to 2D
from sklearn.decomposition import PCA
# BUG FIX: as in the 3D case, the original projected onto a single component and
# transposed, collapsing each network to one point built from the first two
# measurements' scores. Two components give a real 2D embedding of all 7
# measurements per network.
pca = PCA(n_components=2)
x = pca.fit_transform(all_metrics[0])   # Human proteome
y = pca.fit_transform(all_metrics[1])   # C. elegans 2004
z = pca.fit_transform(all_metrics[2])   # C. elegans 2007
a = pca.fit_transform(all_metrics[3])   # Power grid
b = pca.fit_transform(all_metrics[4])   # R collaborators
fig = plt.figure(figsize=(5, 5))
plt.title('Principal Component Analysis (PCA) 2D', fontsize=15)
ax = fig.add_subplot(111)
ax.scatter(x[:, 0], x[:, 1], label='Human proteome')
ax.scatter(y[:, 0], y[:, 1], label='C. elegans 2004')
ax.scatter(z[:, 0], z[:, 1], label='C. elegans 2007')
ax.scatter(a[:, 0], a[:, 1], label='Power grid')
ax.scatter(b[:, 0], b[:, 1], label='R collaborators')
ax.legend()
plt.show()
It can be noted that the neural networks from C. elegans are closer to each other, while the human proteome, power grid and R collaborators are widely spaced from one another, although the Power grid and the Human proteome happen to be closer to C. elegans than the R collaborators network.
Since PCA represents the fact that the statistical metrics of the network characteristics are more similar the closer the points are, we can conclude that the R collab. network has the most different set of metrics compared to the others. This is expected, since it is the only one that is not subject to physical space limitations, which gives nodes the possibility of making connections with literally all the others. In order to better visualize this huge difference, we can try drawing the network using another force-driven map, distinct from the spring layout.
# Redraw the two PCA-extreme networks (R collaborators and Power grid) with the
# Kamada-Kawai force-directed layout for a visual comparison of their structure.
plt.figure(figsize=(20,20))
pos_Rcolab = nx.kamada_kawai_layout(G_Rcolab)
plt.title('R collaborators network', fontsize=20)
nx.draw_networkx(G_Rcolab, pos_Rcolab, node_size=10, width=.1, with_labels=False)
plt.show(True)
plt.figure(figsize=(20,20))
pos_power = nx.kamada_kawai_layout(G_power)
plt.title('Power grid', fontsize=20)
nx.draw_networkx(G_power, pos_power, node_size=10, width=.1, with_labels=False)
plt.show(True)
By thoses force-driven map (kamada-kawai layout), when comparing the diametral opposed networks on PCA (Power grid and R collaborators) it is clearer that on R collab. there are a few highly connected nodes and several nodes with low degree (the better representation of a scale-free network), on the other hand, for power grid, the network has a fewer differences between the nodes' degrees. Therefore, the network measurements presents big differences and their distributions, also distinct statistical metrics, such as mean and standard deviation, to quote a few.
This questions we will make it possible to compare the correlations between the node degree (K) and their nearest-neighbors degree (Knn). The assortativity metric is also associated with this knn x k, and we will also find the network assortativity. Firstly we can take a brief explanation about them
Assortativity measures the tendency of one node with degree k connects to another one with node k as well. It is calculated by finding the Pearson correlation between the degree of two connected nodes. Defined from -1 to 1, the closer to 1, the higher the assortativity (most likely to i-th node has degree k AND j-th node, connected to i, has degree k as well)
One of the ways to calculate assortativity take into account the following equation
$$ \begin{equation} r = \frac{1}{\sigma_a \sigma_b} \sum_{xy} x y \big(e_{xy} - a_x b_y\big), \end{equation} $$where $\textit{e}_{xy}$ stands for the fraction of all edges in the newtork that connects nodes with values (e.g. degree) $\textit{x}$ and $\textit{y}$. In this sense, $\textit{e}_{xy}$ has the properties
$$ \begin{equation} \sum_{xy} e_{xy} = 1; \quad \sum_y e_{xy} = a_x; \quad \sum_x e_{xy} = b_y; \end{equation} $$where $\textit{a}_x$ and $\textit{b}_y$ are the fractions of edges that start at vertices with value x or y, respectively. And, finally, $\sigma_a$ and $\sigma_b$ are the standard deviations of $\textit{a}_x$ and $\textit{b}_y$.
Another way to detect communities on a network consists on checking the nodes' degree inside some fixed length around th i-th node. In another words, finding the degree (k) of the nearest neighbors (nn) of a given node i.
One way to make this study consists in compute the average neighbors degree
$$ \begin{equation} k_{nn,i} = \frac{1}{|N(i)|} \sum_{j \in N(i)} k_j \end{equation} $$where $N(i)$ represents the neighbors of node i and $|N(i)|$, its cardinality (i.e., the number of elements in N(i) vector).
Provided by this result, we can associate, for each i-th nodes' degree, its $knn$ value in order to find if there is some correlation between the i-th degree and the i-th neighbors' degree.
def build_knn_x_k(G):
    """
    FUNCTION:
        Constructs each node's average nearest-neighbour degree (knn) and the average
        of that quantity over all nodes sharing the same degree (knn(k)). Also prints
        the network assortativity coefficient and the mean neighbourhood degree.
    PARAMETERS:
        G: networkx graph object
    RETURN:
        tuple (knn, knnk, ks): knn: vector with each node's average nearest neighbor's degree
                               knnk: vector with the average nearest neighbor degree for each degree value
                               ks: vector of degree values for which knnk was computed
    """
    r = nx.degree_assortativity_coefficient(G)
    print('Assortativity', '%.5f' % r)
    knn = []
    for node in G.nodes():
        aux = nx.average_neighbor_degree(G, nodes=[node])
        knn.append(float(aux[node]))
    knn = np.array(knn)
    print('Average degree of neighborhood ', '%.4f' % np.mean(knn))
    knnk = list()
    ks = list()
    # Explicit ndarray so 'vk == k' below is an elementwise comparison (the original
    # relied on NumPy-scalar broadcasting against a plain Python list).
    vk = np.array(list(dict(G.degree()).values()))
    # BUG FIX: np.arange(min, max) excluded the maximum degree, so the hub nodes
    # never contributed a point to the knn(k) curve; the range now includes max(vk).
    for k in np.arange(np.min(vk), np.max(vk) + 1):
        mask = vk == k
        if len(knn[mask]) > 0:
            knnk.append(np.mean(knn[mask]))
            ks.append(k)
    return knn, knnk, ks
def plot_knn_k_regression(knnk, ks, title=None):
    """
    FUNCTION:
        Plots knnk (average nearest-neighbour degree per degree value) against k
        together with a linear least-squares fit, and prints both the Pearson and
        the Spearman correlation between the two vectors.
    PARAMETERS:
        knnk: vector with the average nearest neighbor degree for each degree value
        ks: vector of degree values
        title: string with the title of the plot
    RETURN:
        No return
    """
    plt.plot(ks, knnk, 'ro')
    plt.title(title, fontsize=15)
    plt.ylabel('Knn(k)')
    plt.xlabel('k')
    plt.grid(True)
    # Degree-1 polynomial fit: knnk ~ slope * k + intercept.
    par = np.polyfit(ks, knnk, 1, full=True)
    slope = par[0][0]
    intercept = par[0][1]
    xl = [min(ks), max(ks)]
    yl = [slope * xx + intercept for xx in xl]
    plt.plot(xl, yl, '-b')
    plt.show(True)
    rho = np.corrcoef(ks, knnk)[0, 1]
    print('Pearson correlation ', rho)
    # The file header already imports scipy.stats as 'stats'; the redundant local
    # 'import scipy.stats as sts' was removed.
    s = stats.spearmanr(ks, knnk)
    print(s)
# knn(k) analysis for the Euroroad network (also prints assortativity and mean knn).
knn, knnk, ks = build_knn_x_k(G_euroroad)
plot_knn_k_regression(knnk, ks, 'Euroroad map network')
As mentioned before, it is expected that every neighborhood has few connections (road maps are physically limited); the average is approximately 3.
With the plotted graph above, it is evident that $knn$ is highly correlated with $k$, meaning that highly connected cities (nodes) tend to connect with other highly connected ones.
Using the same method for visualizing data presented before, we can see how $knn$ metric is distributed over the map.
# Colour the Euroroad layout by node degree and by average neighbour degree (knn).
vk = dict(G_euroroad.degree())
vk = list(vk.values())
dataFr = pd.DataFrame({'vk':vk, 'knn':knn})
title = ('Degree', 'k-nearest neighbors')
acronyms = ('vk', 'knn')
big_title = 'Euroroad map'
plot_multiple_network_measure_map(2, G_euroroad, pos_euro, dataFr, measure_acronym=acronyms,
                                  big_title=big_title, small_title=title, node_size=10)
# Same knn(k) analysis for the C. elegans neural network.
# NOTE(review): 'G_celegans' is used here while earlier cells define G_celegans4 and
# G_celegans7 -- confirm which graph this name refers to.
knn, knnk, ks = build_knn_x_k(G_celegans)
plot_knn_k_regression(knnk, ks, 'C. elegans neural network')
The neural network from C. elegans presents the opposite result when comparing with the Euroroad map. That means knn and k are negatively correlated with moderate correlation (it is approximatelly -0.5 and since correlation is measured between [-1,1], the correlationis said to be moderate). In other words, nodes with high degree happens to have connections with low-connected nodes.
One way to glimpse this result is analysing that there are only a few highly connected neurons, and only the ones that are directly connected with them have high knn. We can conclude that this network has nodes that are more important then others.
In assortativity terms, there is a moderate neuronal preference to attach to ones which have low connections.
# Colour the C. elegans layout by node degree and by average neighbour degree (knn).
vk = dict(G_celegans.degree())
vk = list(vk.values())
dataFr = pd.DataFrame({'vk':vk, 'knn':knn})
title = ('Degree', 'k-nearest neighbors')
acronyms = ('vk', 'knn')
big_title = 'C. elegans neural network'
plot_multiple_network_measure_map(2, G_celegans, pos_celegans, dataFr, measure_acronym=acronyms,
                                  big_title=big_title, small_title=title, node_size=10)
# knn(k) analysis for the US power grid network.
knn, knnk, ks = build_knn_x_k(G_power)
plot_knn_k_regression(knnk, ks, 'Power grid network')
For the power grid network, distinctly from the others, there is almost no correlation between knn and k. That means that the highly connected nodes do not tend to have neighbors that are also highly connected. We can conclude that the correlation is negligible, since its magnitude is close to zero.
In terms of this network, even though the power distribution has a central power supply (as mentioned in previous results) they do not make multiple connections and also do not their neighbors (power lines). It can also be seen on assortativity metric, once it is close to zero, meaning that there is no preferential attachment between power lines that have or have not multiple linkages.
# Colour the power grid layout by node degree and by average neighbour degree (knn).
vk = dict(G_power.degree())
vk = list(vk.values())
dataFr = pd.DataFrame({'vk':vk, 'knn':knn})
title = ('Degree', 'k-nearest neighbors')
acronyms = ('vk', 'knn')
big_title = 'Power grid network'
plot_multiple_network_measure_map(2, G_power, pos_power, dataFr, measure_acronym=acronyms,
                                  big_title=big_title, small_title=title, node_size=6)
# knn(k) analysis for the human protein-protein interaction network.
knn, knnk, ks = build_knn_x_k(G_protein)
plot_knn_k_regression(knnk, ks, 'Human protein network')
Finally for the last network, we have the extreme opposite from euroroad map with its positive high correlation. For human proteome, we have the highest negative correlation between knn and k, meaning that nodes with several connections have the most low-connected neighborhood.
In terms of proteins, in humans the proteins are highly interconnected and it does make sense that proteins that make more connections (can be thought to be proteins that other ones depend on) should have links with the lowest connected ones, because if this highly connected protein happens to failed, only a few that directly deppends on them would be affected. The assortativity (despite subtle) also shows the preference to one protein makes connection with low connected ones.
# Colour the proteome layout by node degree and by average neighbour degree (knn).
vk = dict(G_protein.degree())
vk = list(vk.values())
dataFr = pd.DataFrame({'vk':vk, 'knn':knn})
title = ('Degree', 'k-nearest neighbors')
acronyms = ('vk', 'knn')
big_title = 'Human proteome protein-protein interaction'
plot_multiple_network_measure_map(2, G_protein, pos_protein, dataFr, measure_acronym=acronyms,
                                  big_title=big_title, small_title=title, node_size=6)
For the next questions, we will focus on community detection using some standard algorithms. Some of them are the Girvan-Newman, Louvain, Fast greedy and label propagation. In sequence, we will quickly explain them.
The Girvan-Newman method detects communities by taking into account the betweenness centrality of each edge and removing the highest valued ones, in order to break the communities appart.
Once betweenness centrality measurement considers all the possible shortest path (not only the average, for example) and consider, for the i-th node, the ones over which the path go through i. Therefore, considering $\eta (a,b)$ the total count number of shortest paths between nodes a and b and $\eta (a,i,b)$ the ones that passes through node i,
$$ \begin{equation} B_i = \sum_{a,b} \frac{\eta(a,i,b)}{\eta(a,b)}. \end{equation} $$Another measurable quantity which correlates with betweenness centrality considers random-walk particles. In this formulation,
$$ \begin{equation} B_i = \sum_{a=1}^N \sum_{b=1}^N w(a,i,b), \end{equation} $$where $\textit{w}(a,i,b)$ stands for the number of times node i is visited. Once it is stochastic, the number of steps considered and the average visitation will varied.
The Louvain method for partitioning a network in subgraphs consists in the optimization of Modularity . This is accomplished by selecting the small communities and merging them into one unique node. $\textit{Modularity}$ is defined by the equation
$$ \begin{equation} Q = \frac{1}{2m} \sum_{ij} \Bigg[A_{ij} - \frac{k_i k_j}{2m} \Bigg] \delta(c_i, c_j) \end{equation} $$where $\textit{A}_{ij}$ represents the edge weight between nodes i and j; $\textit{k}_i$ and $\textit{k}_j$ are the sum of the weights assoiated with edges that connects with i and j; $\textit{m}$ is the sum of all edges weights on the whole graph (normalization constant. The double counting is considered by the factor 2); $\textit{c}_i$ and $\textit{c}_j$ are the communities of the nodes (the Kronecker delta turns the element contribution nulled for the sum if i-th and j-th nodes are not in the same community).
Part of the algorithm efficiency comes from moving the already partitioned network into one unique node, computed by
$$ \begin{equation} \Delta Q = \Bigg[\frac{\sum_{in} + k_{i,in}}{2m} - \Bigg(\frac{\sum_{tot} + k_i}{2m} \Bigg)^2\Bigg] - \Bigg[ \frac{\sum_{in}}{2m} - \Bigg(\frac{\sum_{tot}}{2m}\Bigg)^2 - \Bigg(\frac{k_i}{2m}\Bigg)^2\Bigg] \end{equation} $$Each term on the equation represents the quantities as follow:
$\sum_{in}$ is the sum on the weights of connections inside $\textit{C}$ community; $\sum_{tot}$ is the sum of the weighted links incident to nodes in community $\textit{C}$; $\textit{k}_{i,in}$ is the sum of weights of links from i-th node to nodes inside $\textit{C}$ and, finally, $\textit{m}$ is the sum of all weights on the whole network.
The modularity is related to the quality of the best partition, therefore the fast-greedy method consider the optimization to get the best modularity. Also known as Clauset-Newman-Moore greedy modularity algorithm
Once we will need to separate the networks by their communities and each method return different types of data, we will build functions that will receive those communities partitions and standardize them into n-dimensional vectors (where n is the number of communities detected)
def generator_to_list(communities_generator, isSet=False, get_node_communities=False):
    """
    FUNCTION:
        Build a list of communities generated by some partition method.
    PARAMETERS:
        communities_generator: iteration-support generator object (e.g. from
                               community.girvan_newman) or an iterable of node sets
        isSet: if True, take only a single partition (a tuple of node sets) from
               the generator instead of iterating it to exhaustion
        get_node_communities: if True, return one community index per node instead
                              of the list of communities
    RETURN:
        list c: n-dimensional vector containing vectors of each community detected
    """
    comm = list()
    numb_comm = 0  # counts communities in the non-set branch; never returned
    if isSet == True:
        # Take only the *first* partition yielded by the generator (girvan_newman
        # yields successively finer partitions); with k = 2 this loop runs once.
        k = 2
        for i in np.arange(0, k-1):
            comm = next(communities_generator)
    else:
        # Materialize every community as a list, then rewrap as an iterator so
        # both branches hand the same kind of object to the code below.
        for i in communities_generator:
            comm.append(list(i))
            numb_comm+=1
        comm = iter(comm)
    if get_node_communities == True:
        # Flatten: every node of community number 'count' gets label 'count'.
        # NOTE(review): this assumes nodes are numbered consecutively in community
        # order -- confirm against the callers before relying on it.
        nodes_comm = []
        count = 0
        for c_i in comm:
            nodes_comm.extend([count]*len(c_i))
            count+=1
        return nodes_comm
    # Canonical form: sort nodes within each community, then sort the communities.
    c = sorted(map(sorted, comm))
    return c
def dict_generator_to_list(dict_communities_generator, get_node_communities=False):
    """
    FUNCTION:
        Build a list of communities from a {node: community_id} mapping (e.g. the
        output of community_louvain.best_partition).
    PARAMETERS:
        dict_communities_generator: dictionary whose keys are nodes and whose values
                                    are their community ids
        get_node_communities: bool. If True, return the community id of each node
                              in a single vector
    RETURN:
        list comm: n-dimensional vector with one list of nodes per community
                   detected, ordered by community id
    """
    if get_node_communities == True:
        # BUG FIX: the original returned a dict_values view; return a plain list.
        return list(dict_communities_generator.values())
    # Single-pass grouping: the original rescanned the whole dict once per community
    # (O(n * c)) and iterated set(values) in arbitrary order; grouping once and
    # sorting the community ids is O(n + c log c) and deterministic.
    groups = {}
    for node, com in dict_communities_generator.items():
        groups.setdefault(com, []).append(node)
    return [groups[com] for com in sorted(groups)]
Now we can generate our benchmark to be studied. This one will be Fortunato's benchmark
# Generate the LFR (Lancichinetti-Fortunato-Radicchi) benchmark graph: 128 nodes,
# degree fixed at 16, communities of exactly 32 nodes (so 4 communities), low
# mixing (mu = 0.04), and draw it with a spring layout.
from networkx.algorithms.community import LFR_benchmark_graph
N = 128
tau1 = 3        # power-law exponent of the degree distribution
tau2 = 1.5      # power-law exponent of the community-size distribution
mu = 0.04       # mixing parameter: fraction of inter-community edges per node
k =16
minc = 32
maxc = 32
G = LFR_benchmark_graph(n = N, tau1 = tau1, tau2 = tau2, mu = mu, min_degree = k,
                        max_degree = k, min_community=minc, max_community = maxc, seed = 10)
pos=nx.spring_layout(G)
nx.draw(G, with_labels = False, nodecolor='r', edge_color='b',
        node_size=50, font_size=16, width=1,pos = pos)
plt.show(True)
# Girvan-Newman community detection (iterative removal of highest edge-betweenness
# edges); with isSet=True only the first (2-community) partition is taken. Each
# detected community is then drawn in its own colour on the shared layout.
from networkx.algorithms import community
communities_generator = community.girvan_newman(G)
communities_generator = generator_to_list(communities_generator, isSet=True)
comm_GN = communities_generator
print('Total communities: ', len(communities_generator))
colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k', 'w']
plt.figure()
aux = 0
for cm in communities_generator:
    nx.draw_networkx(G.subgraph(cm), pos=pos, node_color = colors[aux], with_labels = False, node_size=50)
    aux = aux + 1
plt.show(True)
# Fast-greedy (Clauset-Newman-Moore) modularity-maximization community detection,
# drawn with one colour per community.
communities_generator = community.greedy_modularity_communities(G)
communities_generator = generator_to_list(communities_generator)
comm_FG = communities_generator
print('Total communities: ', len(communities_generator))
colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k', 'w']
plt.figure()
aux = 0
for cm in communities_generator:
    nx.draw_networkx(G.subgraph(cm), pos=pos, node_color = colors[aux], with_labels = False, node_size=50)
    aux = aux + 1
plt.show(True)
# Label-propagation community detection, drawn with one colour per community.
communities_generator = community.label_propagation_communities(G)
communities_generator = generator_to_list(communities_generator)
comm_LP = communities_generator
print('Total communities: ', len(communities_generator))
colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k', 'w']
plt.figure()
aux = 0
for cm in communities_generator:
    nx.draw_networkx(G.subgraph(cm), pos=pos, node_color = colors[aux], with_labels = False, node_size=50)
    aux = aux + 1
plt.show(True)
from community import community_louvain
communities_generator = community_louvain.best_partition(G)
communities_generator = dict_generator_to_list(communities_generator)
comm_LV = communities_generator
print('Total communities: ', len(communities_generator))
colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k', 'w']
plt.figure()
aux = 0
for cm in communities_generator:
nx.draw_networkx(G.subgraph(cm), pos=pos, node_color = colors[aux], with_labels = False, node_size=50)
aux = aux + 1
plt.show(True)
With the results above, we can see that Girvan-Newman was the only method that returned a different number of communities. Since four communities were expected, we can compare the other methods (namely, Louvain, fast greedy and label propagation) using the Normalized Mutual Information (NMI).
from sklearn import metrics

# Community-by-community NMI between pairs of partitions; each partition
# holds 4 communities, compared index by index.
labels_true = comm_FG
labels_pred = comm_LV
comm_FG_LV = [metrics.normalized_mutual_info_score(labels_true[i], labels_pred[i])
              for i in range(4)]
labels_pred = comm_LP
comm_FG_LP = [metrics.normalized_mutual_info_score(labels_true[i], labels_pred[i])
              for i in range(4)]
labels_true = comm_LP
labels_pred = comm_LV
comm_LP_LV = [metrics.normalized_mutual_info_score(labels_true[i], labels_pred[i])
              for i in range(4)]
print(comm_FG_LV)
print(comm_FG_LP)
print(comm_LP_LV)
As we can see, each column represents a community and each row the comparison between FG and LV, FG and LP, and LP and LV. They are all equal to 1.0; therefore all methods result in the same node partition.
We could see that almost all methods result in the same partitions. Now we can analyse the normalized mutual information for several similarly generated Fortunato benchmark networks in order to study the transition between the presence of distinct communities and their merging. For example, below we show four distinct networks created with the Fortunato benchmark, changing only the mixing parameter $\mu$ from 0.03 (this will be our 'true' network for comparisons) to 0.1, passing through 0.04 and 0.06. The number of nodes is fixed and equal to 800.
# Draw the LFR benchmark for several mixing parameters, keeping the network
# size, degree and community sizes fixed so only mu changes between panels.
N = 800
tau1 = 3
tau2 = 1.5
mu_values = [0.03, 0.04, 0.06, 0.1]
k = 16
minc = 32
maxc = 32
for mu in mu_values:
    # Same seed for every mu, so the only varying ingredient is the mixing.
    G = LFR_benchmark_graph(n = N, tau1 = tau1, tau2 = tau2, mu = mu, min_degree = k,
                            max_degree = k, min_community=minc, max_community = maxc, seed = 10)
    pos = nx.spring_layout(G)
    nx.draw(G, with_labels = False, nodecolor='r', edge_color='b',
            node_size=50, font_size=16, width=.2, pos = pos)
    plt.title('Network with mixing parameter {}'.format(mu), fontsize=16)
    plt.show(True)
It can be seen that for $\mu < 0.04$ there are no connections between communities (they are not mixed). However, when $\mu$ reaches 0.04 they start to mix and, for increasing values, the communities seem to have dissolved. Now we can compare this process quantitatively with the normalized mutual information.
from sklearn import metrics

# Reference ('true') partition: Louvain communities of the benchmark at
# mu = 0.04, as one community label per node (what NMI expects).
N = 800
tau1 = 3
tau2 = 1.5
mu_values = np.arange(0.1, 1, 0.01)
k = 16
minc = 32
maxc = 32
G = LFR_benchmark_graph(n = N, tau1 = tau1, tau2 = tau2, mu = 0.04, min_degree = k,
                        max_degree = k, min_community=minc, max_community = maxc, seed = 10)
communities_generator = community_louvain.best_partition(G)
communities_generator = dict_generator_to_list(communities_generator, get_node_communities=True)
comm_true = list(communities_generator)
nmi_values = []
for mu in mu_values:
    nmi_mean = 0
    trials = 10
    for i in range(trials):
        # BUG FIX: the original passed seed=10 on every trial, so all 10
        # "trials" ran on the very same graph and only Louvain's own noise
        # was averaged.  Vary the seed so the mean also spans distinct
        # benchmark realisations.
        G = LFR_benchmark_graph(n = N, tau1 = tau1, tau2 = tau2, mu = mu, min_degree = k,
                                max_degree = k, min_community=minc, max_community = maxc,
                                seed = 10 + i)
        communities_generator = community_louvain.best_partition(G)
        communities_generator = dict_generator_to_list(communities_generator, get_node_communities=True)
        comm_LV = list(communities_generator)
        nmi = metrics.normalized_mutual_info_score(comm_true, comm_LV)
        nmi_mean += nmi
    nmi_values.append(nmi_mean / trials)
plt.plot(mu_values, nmi_values, 'b-.')
plt.title('Fortunato benchmark', fontsize=15)
plt.ylabel('Normalized Mutual Information', fontsize=13)
plt.xlabel('Mixing parameter', fontsize=13)
plt.show()
We can understand this result as how the mixing parameter affects the community partition of Fortunato's benchmark. It is clearly visible in the network plots above that for $\mu \geq 0.04$ the communities start to mix and, as a result, the mutual information falls until there is no relation with the communities of the benchmark at $\mu=0.04$. The plot was made with the mean over 10 runs at each point.
On this question it is expected to have the visualization for zachary karate club network using each community detection method already mentioned. That is Girvan-Newman, Louvain, Fast greedy and Label propagation. At first, let us make the standard network
# Load the Zachary karate-club edge list (local path) and draw it.
G = nx.read_edgelist('/home/jorge/Documents/redes_complexas/zachary.txt', nodetype=int)
G = G.to_undirected()
G = nx.convert_node_labels_to_integers(G, first_label=0)
# Layout is computed once and reused by all community plots below.
pos = nx.fruchterman_reingold_layout(G)
nx.draw_networkx(G, pos=pos, node_color='b', node_size=200)
plt.show(True)
Now we can use our previous defined functions to take the communities from each method, also previously mentioned
# Girvan-Newman (divisive, edge-betweenness based) on the karate club.
communities_generator = community.centrality.girvan_newman(G)
communities_generator = generator_to_list(communities_generator, isSet=True)
comm_GN = communities_generator
# BUG FIX: the original used np.array(...).flatten(), but on a ragged list
# of communities that builds a 1-D *object* array whose flatten() is a
# no-op, so the nested node lists were never flattened.  Flatten explicitly.
commf = [node for cm in comm_GN for node in cm]
print(commf)
print('Total communities: ', len(communities_generator))
colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k', 'w']
plt.figure()
aux = 0
for cm in communities_generator:
    nx.draw_networkx(G.subgraph(cm), pos=pos, node_color=colors[aux],
                     with_labels=False, node_size=200)
    aux = aux + 1
plt.show(True)
# The plotting loop was copy-pasted (with `colors` rebuilt) once per method;
# fold it into a single helper.
colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k', 'w']

def _plot_partition(graph, layout, communities, node_size=200):
    """Plot each detected community of `graph` with its own colour."""
    plt.figure()
    for idx, members in enumerate(communities):
        nx.draw_networkx(graph.subgraph(members), pos=layout,
                         node_color=colors[idx], with_labels=False,
                         node_size=node_size)
    plt.show(True)

# Fast greedy (greedy modularity maximisation)
communities_generator = community.greedy_modularity_communities(G)
communities_generator = generator_to_list(communities_generator)
comm_FG = communities_generator
print('Total communities: ', len(communities_generator))
_plot_partition(G, pos, comm_FG)

# Label propagation
communities_generator = community.label_propagation_communities(G)
communities_generator = generator_to_list(communities_generator)
comm_LP = communities_generator
print('Total communities: ', len(communities_generator))
_plot_partition(G, pos, comm_LP)

# Louvain
communities_generator = community_louvain.best_partition(G)
communities_generator = dict_generator_to_list(communities_generator)
comm_LV = communities_generator
print('Total communities: ', len(communities_generator))
_plot_partition(G, pos, comm_LV)
On this question, we need to examine relations similarly to question 6, however using Fortunato's benchmark without a maximum community parameter. Therefore, as before, firstly we will draw the benchmark network generated
# LFR benchmark WITHOUT a maximum community size: only an average degree
# and a minimum community size are imposed, so community sizes vary freely.
from networkx.algorithms.community import LFR_benchmark_graph
n = 250
tau1 = 3
tau2 = 1.5
mu = 0.1
G = LFR_benchmark_graph(n, tau1, tau2, mu, average_degree=5, min_community=20, seed=10)
pos = nx.spring_layout(G)
nx.draw(G, with_labels = False, nodecolor='r', edge_color='b',
        node_size=50, font_size=16, width=1, pos = pos)
plt.show(True)
For these detections, we will see that the number of communities detected is greater than the number of colours in matplotlib's standard palette; thus we need to create a list with many other colours, mostly given by hexadecimal values.
# Extended colour list: matplotlib's base colours followed by all CSS4
# named colours, so every detected community can get its own colour.
from matplotlib import colors as mcolors
colors = list({**mcolors.BASE_COLORS, **mcolors.CSS4_COLORS}.keys())
After that, we can follow the same steps as before for community detection
# Girvan-Newman (divisive, edge-betweenness based); colours come from the
# extended palette built above since many communities are expected.
communities_generator = community.centrality.girvan_newman(G)
communities_generator = generator_to_list(communities_generator, isSet=True)
comm_GN = communities_generator
print('Total communities: ', len(communities_generator))
plt.figure()
aux = 0
for cm in communities_generator:
    # One colour per community.
    nx.draw_networkx(G.subgraph(cm), pos=pos, node_color = colors[aux], with_labels = False, node_size=50)
    aux = aux + 1
plt.show(True)
# Fast greedy (greedy modularity maximisation).
communities_generator = community.greedy_modularity_communities(G)
communities_generator = generator_to_list(communities_generator)
comm_FG = communities_generator
print('Total communities: ', len(communities_generator))
plt.figure()
aux = 0
for cm in communities_generator:
    nx.draw_networkx(G.subgraph(cm), pos=pos, node_color = colors[aux], with_labels = False, node_size=50)
    aux = aux + 1
plt.show(True)
# Label propagation.
communities_generator = community.label_propagation_communities(G)
communities_generator = generator_to_list(communities_generator)
comm_LP = communities_generator
print('Total communities: ', len(communities_generator))
plt.figure()
aux = 0
for cm in communities_generator:
    nx.draw_networkx(G.subgraph(cm), pos=pos, node_color = colors[aux], with_labels = False, node_size=50, )
    aux = aux + 1
plt.show(True)
# Louvain (python-louvain package).
communities_generator = community_louvain.best_partition(G)
communities_generator = dict_generator_to_list(communities_generator)
comm_LV = communities_generator
print('Total communities: ', len(communities_generator))
plt.figure()
aux = 0
for cm in communities_generator:
    nx.draw_networkx(G.subgraph(cm), pos=pos, node_color = colors[aux], with_labels = False, node_size=50)
    aux = aux + 1
plt.show(True)
As mentioned above, the number of partitioned communities is huge, and we can measure the disaggregation in terms of Normalized Mutual Information (NMI), as done in previous questions
Firstly we can see how $\mu$ alters the network visually.
# Draw the variable-community-size benchmark for several mixing parameters.
from networkx.algorithms.community import LFR_benchmark_graph
n = 800
tau1 = 3
tau2 = 1.5
mu_values = [0.04, 0.08, 0.1, 0.5]
for mu in mu_values:
    G = LFR_benchmark_graph(n, tau1, tau2, mu, average_degree=5, min_community=20, seed=10)
    pos = nx.spring_layout(G)
    nx.draw(G, with_labels = False, nodecolor='r', edge_color='b',
            node_size=20, font_size=16, width=1, pos = pos)
    plt.title('Network with mixing parameter {}'.format(mu), fontsize=16)
    plt.show(True)
Similarly to the other network with varying mixing parameter, as it increases, the more difficult it is to visually separate the communities from each other. Let us make a quantitative measure over it
from sklearn import metrics
from networkx.algorithms import community

# Reference partition: label-propagation communities of the benchmark at
# mu = 0.04, as one community label per node (what NMI expects).
n = 1600
tau1 = 3
tau2 = 1.5
mu_values = np.arange(0.1, 1, 0.01)
G = LFR_benchmark_graph(n, tau1, tau2, mu=0.04, average_degree=5, min_community=40, seed=10)
communities_generator = community.label_propagation_communities(G)
communities_generator = generator_to_list(communities_generator, isSet=False, get_node_communities=True)
comm_true = (communities_generator)
nmi_values = []
for mu in mu_values:
    nmi_mean = 0
    trials = 50
    for i in range(trials):
        # BUG FIX: the original passed seed=10 on every trial, so the 50
        # "trials" all ran on the very same benchmark graph and the average
        # was (nearly) degenerate.  Use a distinct seed per trial.
        G = LFR_benchmark_graph(n, tau1, tau2, mu, average_degree=5, min_community=40, seed=10 + i)
        communities_generator = community.label_propagation_communities(G)
        communities_generator = generator_to_list(communities_generator, isSet=False, get_node_communities=True)
        comm_GN = communities_generator
        nmi = metrics.normalized_mutual_info_score(comm_true, comm_GN)
        # Accumulate the running mean directly.
        nmi_mean += nmi / trials
    nmi_values.append(nmi_mean)
plt.plot(mu_values, nmi_values, 'b-.')
plt.title('Fortunato benchmark', fontsize=15)
plt.ylabel('Normalized Mutual Information', fontsize=13)
plt.xlabel('Mixing parameter', fontsize=13)
plt.show()
We can see that for $\mu < 0.7$ the Fortunato benchmark presents low variation in the community partition, since all runs show nearly the same high NMI (~0.8). After 0.7 the network starts to form relatively unstable clusters, and the NMI falls significantly (to ~0.3). Qualitatively, we can see this evolution in the networks drawn above in this question.
Using the already loaded networks, we will make the study over the Euroroad map, C. elegans neural network, US airport and human proteome networks. The measurements made will be number of nodes (N), average degree (K), Assortativity (ASS), Average shortest path (ASP) and Modularity (MOD)
Some of the community detection algorithms generates a type called 'generator'. In order to standardize them all, we can build a function to store the communities in a list-type variable
N = []   # Stores number of nodes
K = []   # Stores average degree
ASS = [] # Stores assortativity
ASP = [] # Stores average shortest path
MOD = [] # Stores modularity (4 entries per network: FG, LP, GN, Louvain)
from networkx.algorithms import community
from community import community_louvain
# Networks loaded earlier in the notebook, in the fixed order used by the
# table below: 0 Euroroad, 1 C. elegans, 2 US airports, 3 human proteome.
for i, G_i in enumerate([G_euroroad, G_celegans, G_usair, G_protein]):
    n = len(G_i)
    N.append(n)
    # BUG FIX: average degree is 2*M/n with M the number of EDGES; the
    # original used number_of_nodes() here, which always yields K = 2.
    M = G_i.number_of_edges()
    K.append(2 * M / n)
    r = nx.degree_assortativity_coefficient(G_i)
    ASS.append(r)
    ASP.append(nx.average_shortest_path_length(G_i))
    # Fast greedy (already returns a full partition)
    communities_generator = community.greedy_modularity_communities(G_i)
    mod = community.modularity(G_i, communities_generator)
    MOD.append(mod)
    # Label propagation
    communities_generator = community.label_propagation_communities(G_i)
    communities_generator = generator_to_list(communities_generator)
    mod = community.modularity(G_i, communities_generator)
    MOD.append(mod)
    # Girvan-Newman
    communities_generator = community.centrality.girvan_newman(G_i)
    communities_generator = generator_to_list(communities_generator, isSet=True)
    mod = community.modularity(G_i, communities_generator)
    MOD.append(mod)
    # Louvain
    communities_generator = community_louvain.best_partition(G_i)
    communities_generator = dict_generator_to_list(communities_generator)
    mod = community.modularity(G_i, communities_generator)
    MOD.append(mod)
    print(i)
print('Done!')
# Regroup the flat MOD list (4 methods x 4 networks, interleaved) into one
# column per detection method via strided slices.
MOD1 = MOD[0::4]  # fast greedy
MOD2 = MOD[1::4]  # label propagation
MOD3 = MOD[2::4]  # Girvan-Newman
MOD4 = MOD[3::4]  # Louvain
df = pd.DataFrame({'N': N, 'K': K, 'ASS': ASS, 'ASP': ASP,
                   'MOD (FSTGRDY)': MOD1, 'MOD (LBLPRPGT)': MOD2,
                   'MOD (GVNNWMN)': MOD3, 'MOD (LVAIN)': MOD4})
df
Each row on this table represents one network. In order,
0 - Euroroad map, 1 - C. elegans neural network, 2 - US airport, and 3 - human proteome networks.
Comparing each value from the table, we can reorganize them by sorting each column. Thus we have
$\textbf{ASS}$ = [2, 1, 3, 0]
$\textbf{ASP}$ = [1, 2, 3, 0]
$\textbf{MOD (Fast Greedy)}$ = [2, 1, 3, 0]
$\textbf{MOD (Label Propagation)}$ = [1, 2, 3, 0]
$\textbf{MOD (Girvan-Newman)}$ = [1, 3, 2, 0]
$\textbf{MOD (Louvain)}$ = [2, 1, 3, 0]
By comparing these networks (it is worth mentioning that this does not imply that the correlations always hold), we can conclude that assortativity matches precisely the modularity found by the Fast-greedy and Louvain methods.
We can work on this result by comparing how each community detection method works. The Louvain method checks the modularity of small communities and merges the nodes into one unique node that carries the modularity information. Comparing this process with the idea of the assortativity measurement, i.e., the tendency of nodes with k connections to link to other nodes with the same degree k, it appears to correlate with the Louvain method.
For Fast-greedy, it tries to optimize the modularity by assigning each node to a community, then selecting a pair and computing the new modularity. It keeps merging only if the modularity grows. Even though the methods are not equal, they work with the same concept of merging similar-degree nodes. Therefore, as in the paragraph above, it makes sense that there is a correlation between assortativity and the Fast-greedy method.
Suggestion: Based on Osame Kinouchi and Mauro Copelli's article 'Optimal Dynamical Range of Excitable Networks at Criticality' (on arXiv: https://arxiv.org/abs/q-bio/0601037), we tried to reproduce the results and vary some parameters in order to study the model.
This suggestion is based on study made on Statistics course, which involves the model's statistical construction as well as the analysis of the critical behavior. It is a well-designed experiment for this course's sake, once it considers Erdos-Renyi graph structure and the evolution of dynamic process on networks (future topics for this subject)
The idea is to build the environment from scratch, without using the networkx library. Afterwards, we can extrapolate the results to other random graph structures and compare the results obtained.
import matplotlib.pyplot as plt
import numpy as np
Firstly, we must define the vector of states of each node of the network
def state_vector(N):
    """
    FUNCTION:
        Build the initial state vector associated with the N network nodes.
    PARAMETERS:
        N: number of nodes on the network
    RETURN:
        list: N entries, all zero (every neuron starts quiescent)
    """
    return [0] * N
Now, we must build an adjacency matrix (one of the easiest representation, although it has some memory problem, once the matrix is sparse and most of the memory is not well-used)
def build_adjacency_matrix(N, K, sigma):
    """
    FUNCTION:
        Build a symmetric, weighted adjacency matrix with random connections
        (Erdos-Renyi-like): int(N*K/2) undirected edges are placed between
        distinct random node pairs, each weighted uniformly in [0, Pmax)
        with Pmax = 2*sigma/K.
    PARAMETERS:
        N: total node count
        K: desired mean degree (the network receives int(N*K/2) edges)
        sigma: branching parameter; sets the maximum transmission
               probability Pmax = 2*sigma/K of an edge
               (the original docstring described K and sigma identically
               and called the result a "2x2" matrix; it is N x N)
    RETURN:
        numpy.ndarray: N x N symmetric adjacency/weight matrix
    """
    mat = np.zeros((N, N))
    Pmax = 2 * sigma / K
    ki = 0
    # Keep drawing random pairs until int(N*K/2) distinct edges are placed;
    # self-loops and already-filled pairs are rejected and redrawn.
    while ki < int(N * K / 2):
        i, j = int(np.random.rand() * N), int(np.random.rand() * N)
        if mat[i][j] == 0. and i != j:
            mat[i][j] = np.random.rand() * Pmax
            mat[j][i] = mat[i][j]
            ki += 1
    return mat
Now, we can define a function capable to generate random external impulses to mimic neural stimuli. It will be generated following the Poisson's process.
def exogenous_pulse(N, state_vect, r):
    """
    FUNCTION:
        Apply an exogenous (external) stimulus: for every node an
        exponentially distributed waiting time is drawn and, when it falls
        below r, a quiescent node is activated (state 0 -> 1).
    PARAMETERS:
        N: total number of nodes on the network
        state_vect: instant state vector associated with each node
            (modified in place; already-active nodes are never touched)
        r: Poisson-rate-like parameter controlling the pulse intensity
    RETURN:
        No return
    """
    for node in range(N):
        # Inverse-transform sample of an Exponential(r) waiting time;
        # exactly one random draw per node, for every node.
        wait = -(1 / r) * np.log(1 - np.random.rand())
        if wait < r and state_vect[node] == 0:
            state_vect[node] = 1
The first step must be done by changing the once zeroed-state vector at random. It is made by filtering only the currently activated node and, with some uniform probability, change each of their neighbors
def initial_step(N, state_vect, adj_matrix):
    """
    FUNCTION:
        First propagation step: each quiescent node is activated with a
        probability given by the summed edge weights of its currently
        active neighbours; nodes in state 1 advance to state 2.
    PARAMETERS:
        N: total amount of nodes
        state_vect: instant state vector associated with each node
            (modified in place)
        adj_matrix: adjacency (weight) matrix, fixed for the network
    RETURN:
        No return
    """
    # BUG FIX: the original computed `adj_matrix * state_vect`, an
    # ELEMENTWISE product that yields an N x N matrix, so the comparison
    # below compared a scalar against a whole row (ValueError for N > 1).
    # The intended quantity is the matrix-vector product, as already used
    # in neural_evolution.
    init_transit_vect = np.matmul(adj_matrix, state_vect)
    for n in range(N):
        if np.random.rand() < init_transit_vect[n] and state_vect[n] == 0:
            state_vect[n] = 1
        elif state_vect[n] == 1:
            state_vect[n] += 1
Now we can make the dynamical process execution. In order to model the neurons with refractory states (i.e. states right after an activation that, after a specific time span, cannot be activated again)
def neural_evolution(N, T, state_vect, adj_matrix, num_refractory_state):
    """
    FUNCTION:
        Update the neurons' states, executing the dynamical process on the
        network for T steps.  At the end, plots the instantaneous activity
        (fraction of neurons active at each time step) via
        plot_instant_activity (defined elsewhere in this notebook).
    PARAMETERS:
        N: node total amount
        T: time span [0,T] over which the neuronal network will be evaluated
        state_vect: current state vector associated to each node
        adj_matrix: adjacency matrix (fixed)
        num_refractory_state: number of refractory states (in which a
            neuron cannot be activated again)
    RETURN:
        No return
    """
    # Activation input of each node = weighted sum over its neighbours'
    # states (matrix-vector product).
    probab_transit_matrix = np.matmul(adj_matrix, state_vect)
    instant_activ_vector = [0 for l in range(T)]
    t = 0
    while t < T:
        actives = 0
        transient_vect = [0 for i in range(N)]
        n = 0
        while n < N:
            # A quiescent node fires with probability given by its input;
            # the new activation is staged in transient_vect and only
            # merged in after the whole sweep.
            if np.random.rand() < probab_transit_matrix[n] and state_vect[n] == 0:
                transient_vect[n] = 1
            else:
                transient_vect[n] = 0
            if state_vect[n] == 1:
                actives+=1
            # Non-quiescent nodes walk through the refractory chain
            # 1 -> 2 -> ... -> num_refractory_state -> 0 (modular step).
            if state_vect[n] != 0:
                state_vect[n] = (state_vect[n]+1)%(num_refractory_state+1)
            n+=1
        instant_activ_vector[t] = actives/N
        state_vect = [state_vect[j] + transient_vect[j] for j in range(N)]
        # NOTE(review): the next step's input counts only the nodes that
        # fired THIS step (transient_vect), not the full state vector.
        probab_transit_matrix = np.matmul(adj_matrix, transient_vect)
        t+=1
    plot_instant_activity(T, instant_activ_vector)
    return
def time_evolution(N, T, state_vect, adj_matrix, num_refractory_states):
    """
    FUNCTION:
        Run the neuronal dynamics for T steps: an exogenous Poisson-like
        stimulus (r = 0.5) is applied for t in [100, 300]; active neurons
        (state 1) excite their quiescent neighbours, then walk through the
        refractory chain 2, 3, ..., num_refractory_states before returning
        to the quiescent state 0.  The per-step fraction of active neurons
        is also written to '<num_refractory_states>.dat' (one line/step).
    PARAMETERS:
        N: node total amount
        T: time span [0,T] over which the neuronal network will be evaluated
        state_vect: current state vector associated to each node (mutated)
        adj_matrix: adjacency matrix (fixed); either a dense numpy array or
            a scipy sparse matrix (e.g. from nx.adjacency_matrix)
        num_refractory_states: number of refractory states (in which a
            neuron cannot be activated again)
    RETURN:
        list: instantaneous activity (fraction of active neurons) per step
    """
    initial_step(N, state_vect, adj_matrix)
    transient_states_vect = [0 for i in range(N)]
    instant_activity_vect = [0 for j in range(T)]
    # `with` guarantees the activity log is closed even on error.
    with open(str(num_refractory_states) + '.dat', 'w') as arq:
        for t in range(T):
            actives = 0
            # External stimulus window.
            if t >= 100 and t <= 300:
                r = 0.5
                exogenous_pulse(N, state_vect, r)
            for n in range(N):
                if state_vect[n] > 1 and state_vect[n] < num_refractory_states:
                    # Refractory: cannot fire, does not excite others.
                    transient_states_vect[n] = 0
                elif state_vect[n] == num_refractory_states:
                    # Refractory period over: back to quiescence.
                    state_vect[n] = 0
                elif state_vect[n] == 1:
                    transient_states_vect[n] = 1
                    actives += 1
            # BUG FIX: the original computed `adj_matrix * transient_states_vect`,
            # which for a dense numpy array is an ELEMENTWISE product giving an
            # N x N matrix (and a ValueError in the comparison below).  The
            # intended quantity is the matrix-vector product; .dot() computes
            # it for dense arrays AND keeps the scipy-sparse case
            # (nx.adjacency_matrix) working unchanged.
            transit_vect = adj_matrix.dot(transient_states_vect)
            for m in range(N):
                if np.random.rand() < transit_vect[m] and state_vect[m] == 0:
                    state_vect[m] = 1
                elif state_vect[m] > 0:
                    state_vect[m] += 1
            instant_activity_vect[t] = actives / N
            arq.write(str(actives / N))
            arq.write('\n')
    return instant_activity_vect
def edges_count(N, adj_matrix):
    """
    FUNCTION:
        Count the number of edges (non-zero entries) in each node's row of
        the adjacency matrix, i.e. the degree of every node.
    PARAMETERS:
        N: total amount of nodes
        adj_matrix: adjacency matrix (fixed)
    RETURN:
        list: number of edges of each node (empirical degree distribution)
    """
    degrees = []
    for row in range(N):
        degrees.append(sum(1 for col in range(N) if adj_matrix[row][col] != 0))
    return degrees
N = 1000
sigma = 1
# Degree distribution for three mean-degree values; the original repeated
# the build/count/plot triplet verbatim for K = 10, 5, 20 - fold it into a
# single loop (same order, same labels).
for K in (10, 5, 20):
    # Build the adjacency matrix, kept fixed through the dynamical process.
    adj_matrix = build_adjacency_matrix(N, K, sigma)
    # System characterization: degrees of every node.
    vect_connect = edges_count(N, adj_matrix)
    plt.hist(vect_connect, histtype=u'step', bins=50, density=True,
             label='K = {:02d}'.format(K))
plt.title('Degree distribution. Multiple K', fontsize=15)
plt.ylabel('P(k)')
plt.xlabel('Degree k')
plt.legend()
plt.show()
As we can see, the degree distribution follows a normal distribution such that $\langle{k}\rangle = K$, with K the parameter over which we have control. Now we can perform the dynamical process and see how the system behaves.
Firstly, let us consider the case where the system is exposed to a constant exogenous impulse
# Single run on the Erdos-Renyi-like network at critical connectivity.
N = 5000   # Number of neurons
K = 10     # Mean degree
sigma = 1  # Connectivity (branching parameter)
r = 0.5    # Poisson parameter
T = 1000   # Time maximum limit
n = 10     # Number of refractory states
# Build vector containing neurons with null state
states = state_vector(N)
# Build the adjacency matrix, kept fixed through all the dynamical process
adj_matrix = build_adjacency_matrix(N, K, sigma)
# Time evolution for neurons' state vector (stimulus applied for t in [100,300])
instant_state_vect = time_evolution(N, T, states, adj_matrix, n)
time_span = [i for i in range(T)]
plt.subplots(1,1)
plt.ylabel("Instant activity $\\rho_t$")
plt.xlabel("Time (ms)")
plt.plot(time_span, instant_state_vect)
plt.title("Instantaneous density of activated neurons", fontsize=15)
plt.show()
It can be noted that, after the constant impulse is applied (t=100), the system reaches a highly activated state. Neurons then exchange pulses through inner activations via their connections, which makes the number of activated neurons oscillate and decay until the activity density (fraction of activated neurons) settles at a constant value. That means that, under constant stimuli, a fixed fraction of neurons (with n=10 refractory states) oscillates between active and inactive states
# Replot the same run side by side: full series (left) and a zoom on
# t in [440, 490] (right) to expose the oscillation period.
time_span = [i for i in range(T)]
plt.figure(1, figsize=(15,5))
plt.suptitle("Instantaneous density of activated neurons", fontsize=20)
plt.subplot(1,2,1)
plt.ylabel("Instant activity $\\rho_t$")
plt.xlabel("Time (ms)")
plt.plot(time_span, instant_state_vect)
plt.subplot(1,2,2)
plt.ylabel("Instant activity $\\rho_t$")
plt.xlabel("Time (ms)")
plt.xlim(440,490)
plt.ylim(0.05, 0.1)
plt.plot(time_span, instant_state_vect)
plt.show()
From the same process, we can focus on a small segment of the plot to make it clearer that the oscillation follows a pattern (it would be better to take the mean over several measurements of the instant activity) with oscillation frequency $\nu \approx \frac{1}{n}$, where n is the number of refractory states. For this case, the frequency is 0.1 ms$^{-1}$, i.e. the period of oscillation is $T \approx 10$ ms. This makes sense, since after 10 ms (the unit is arbitrary) the neurons that were activated are again allowed to perceive a pulse from their neighbours (or from outside, since here we kept a constant random pulse)
For the following results, we will provide a constant random exogenous pulse for a limited time span and allow the system to self sustain. As we expected from the cited paper, the system must vary with the connectivity (i.e. the normalized degree, or the transit probability)
# Sweep the connectivity sigma = 0.8, 1.0, 1.2, 1.4 on a SMALL network
# (N = 500); a fresh state vector and adjacency matrix are built per panel.
N = 500      # Number of neurons
K = 10       # Mean degree
sigma = 0.8  # Connectivity; incremented by 0.2 at the end of each iteration
r = 0.5      # Poisson parameter
T = 1000     # Time maximum limit
n = 10       # Number of refractory states
plt.figure(1, figsize=(15,10))
plt.suptitle("Instantaneous density of activated neurons", fontsize=20)
for i in range(4):
    # Build vector containing neurons with null state
    states = state_vector(N)
    # Build the adjacency matrix, kept fixed through all the dynamical process
    adj_matrix = build_adjacency_matrix(N, K, sigma)
    # Time evolution for neurons' state vector
    instant_state_vect = time_evolution(N, T, states, adj_matrix, n)
    time_span = [i for i in range(T)]
    plt.subplot(2,2,i+1)
    plt.ylabel("Instant activity $\\rho_t$")
    plt.xlabel("Time (ms)")
    plt.plot(time_span, instant_state_vect, label='$\\sigma$ = {}'.format(sigma))
    plt.legend()
    sigma+=0.2
plt.show()
Now, considering a bigger network (N = 5000 nodes)
# Same sigma sweep as above, now on a LARGE network (N = 5000), where the
# phase transition at sigma = 1 becomes visible.
N = 5000     # Number of neurons
K = 10       # Mean degree
sigma = 0.8  # Connectivity; incremented by 0.2 at the end of each iteration
r = 0.5      # Poisson parameter
T = 1000     # Time maximum limit
n = 10       # Number of refractory states
plt.figure(1, figsize=(15,10))
plt.suptitle("Instantaneous density of activated neurons", fontsize=20)
for i in range(4):
    # Build vector containing neurons with null state
    states = state_vector(N)
    # Build the adjacency matrix, kept fixed through all the dynamical process
    adj_matrix = build_adjacency_matrix(N, K, sigma)
    # Time evolution for neurons' state vector
    instant_state_vect = time_evolution(N, T, states, adj_matrix, n)
    time_span = [i for i in range(T)]
    plt.subplot(2,2,i+1)
    plt.ylabel("Instant activity $\\rho_t$")
    plt.xlabel("Time (ms)")
    plt.plot(time_span, instant_state_vect, label='$\\sigma$ = {}'.format(sigma))
    plt.legend()
    sigma+=0.2
plt.show()
Similar to the results from the paper (fortunately), there is a behaviour change at $\sigma = \sigma_c = 1$, also called the critical connectivity. This behaviour consists of a phase transition in the sustainability of activated neurons: it is the lowest transmission probability separating regimes that are and are not self-sustained. (Remember that the exogenous impulse is applied only for t $\in$ [100, 300] ms.)
It is worth mentioning that this phenomenon only persists for large networks. For small ones it does not happen, as can be seen above.
For visual appreciation, we can draw the network from the adjacency matrix
# Draw the Erdos-Renyi-like network from its (last built) adjacency matrix.
# NOTE(review): from_numpy_matrix was removed in networkx >= 3.0; newer
# versions use nx.from_numpy_array - confirm the installed version.
G_neurons = nx.convert_matrix.from_numpy_matrix(adj_matrix)
pos_neur = nx.spring_layout(G_neurons)
plt.figure(figsize=(20,20))
nx.draw_networkx(G_neurons, pos_neur, node_size=10, width=.1, with_labels=False)
plt.show(True)
For the last considerations, we can vary the number of refractory states and see how this affects the sustainability of the system
# Sweep the number of refractory states n = 2, 6, 10, 14 at supercritical
# sigma = 1.4; all four runs are overlaid on the same two panels
# (full series on the left, zoom on t in [200, 450] on the right).
N = 5000     # Number of neurons
K = 10       # Mean degree
sigma = 1.4  # Connectivity
r = 0.5      # Poisson parameter
T = 1000     # Time maximum limit
n = 2        # Number of refractory states; incremented by 4 per iteration
plt.figure(1, figsize=(15,5))
plt.suptitle("Instantaneous density of activated neurons", fontsize=20)
for i in range(4):
    # Build vector containing neurons with null state
    states = state_vector(N)
    # Build the adjacency matrix, kept fixed through all the dynamical process
    adj_matrix = build_adjacency_matrix(N, K, sigma)
    # Time evolution for neurons' state vector
    instant_state_vect = time_evolution(N, T, states, adj_matrix, n)
    time_span = [i for i in range(T)]
    plt.subplot(1,2,1)
    plt.ylabel("Instant activity $\\rho_t$")
    plt.xlabel("Time (ms)")
    plt.plot(time_span, instant_state_vect, label='n = {}'.format(n))
    plt.legend()
    plt.subplot(1,2,2)
    plt.ylabel("Instant activity $\\rho_t$")
    plt.xlabel("Time (ms)")
    plt.xlim(200,450)
    plt.plot(time_span, instant_state_vect, label='n = {}'.format(n))
    plt.grid(True)
    plt.legend()
    n+=4
plt.show()
We can conclude that the number of refractory states is negatively correlated with the instant fraction of activated neurons: the more states the neurons must pass through before regaining the ability to be activated, the lower the fraction of activated neurons. (Also, for n > 10 refractory states, sometimes the exogenous impulse cannot propagate and the signal fades away as soon as we turn off the external stimulus.)
Now we can make the same analysis considering a neuronal network built from Power grid, C. elegans true neural network and R collaborators.
# Same dynamics, now on the power-grid network loaded earlier (G_power).
N = nx.number_of_nodes(G_power)  # Number of neurons
K = 10     # Mean degree (not used here: the topology is fixed by G_power)
sigma = 1  # Connectivity (not used here; kept for symmetry with the ER runs)
r = 0.5    # Poisson parameter
T = 1000   # Time maximum limit
n = 10     # Number of refractory states
# Build vector containing neurons with null state
states = state_vector(N)
# The (sparse) adjacency matrix of the real network replaces the random one.
# NOTE(review): nx.adjacency_matrix returns a scipy sparse matrix, for which
# `adj_matrix * vector` inside time_evolution acts as a matrix-vector
# product (unlike the dense-array case).
adj_matrix = nx.adjacency_matrix(G_power)
# Time evolution for neurons' state vector
instant_state_vect = time_evolution(N, T, states, adj_matrix, n)
Firstly, let us keep the exogenous stimuli over the whole time span
# Activity over the full time span (stimulus kept on).
plt.subplots(1, 1)
plt.title("Instantaneous density of activated neurons", fontsize=15)
plt.xlabel("Time (ms)")
plt.ylabel("Instant activity $\\rho_t$")
plt.plot(list(range(T)), instant_state_vect)
plt.show()
# Two panels: whole run (left) and a zoom on 440-490 ms (right).
time_span = list(range(T))
plt.figure(1, figsize=(15, 5))
plt.suptitle("Instantaneous density of activated neurons", fontsize=20)

plt.subplot(1, 2, 1)
plt.xlabel("Time (ms)")
plt.ylabel("Instant activity $\\rho_t$")
plt.plot(time_span, instant_state_vect)

plt.subplot(1, 2, 2)
plt.xlabel("Time (ms)")
plt.ylabel("Instant activity $\\rho_t$")
plt.xlim(440, 490)
plt.plot(time_span, instant_state_vect)

plt.show()
We can see that the network oscillates with a period proportional to the number of refractory states (as before); however, the oscillatory decay behavior lasts longer than in the Erdős–Rényi generated graph, even though it also tends to converge to a fixed fraction of activated neurons.
Now, turning off the stimuli after 100ms of exposure, and since we have no control over the connectivity ($\sigma$, on the first graph), we can see how the number of refractory states alters the behavior.
# --- Power grid: stimulus switched off after 100 ms, varying n by 4 ---
N = nx.number_of_nodes(G_power)  # number of neurons
r = 0.5                          # Poisson parameter
T = 1000                         # maximum time limit
n = 2                            # number of refractory states

plt.figure(1, figsize=(15, 5))
plt.suptitle("Instantaneous density of activated neurons", fontsize=20)

# All neurons start quiescent; the adjacency matrix is fixed throughout.
states = state_vector(N)
adj_matrix = nx.adjacency_matrix(G_power)

for _ in range(4):
    # NOTE(review): `states` is shared across iterations — confirm whether
    # time_evolution mutates it or each run should restart from null states.
    instant_state_vect = time_evolution(N, T, states, adj_matrix, n)
    time_span = list(range(T))

    plt.subplot(1, 2, 1)
    plt.xlabel("Time (ms)")
    plt.ylabel("Instant activity $\\rho_t$")
    plt.plot(time_span, instant_state_vect, label='n = {}'.format(n))
    plt.legend()

    plt.subplot(1, 2, 2)
    plt.xlabel("Time (ms)")
    plt.ylabel("Instant activity $\\rho_t$")
    plt.xlim(200, 450)
    plt.plot(time_span, instant_state_vect, label='n = {}'.format(n))
    plt.grid(True)
    plt.legend()

    n += 4
plt.show()
The only regime that allows self-sustainable behavior is that of restless neurons. Because of that, we will consider the behavior for n = 3, n = 4 and n = 5.
# --- Power grid: same experiment for n = 3, 4, 5, 6 (step of 1) ---
N = nx.number_of_nodes(G_power)  # number of neurons
r = 0.5                          # Poisson parameter
T = 1000                         # maximum time limit
n = 3                            # number of refractory states

plt.figure(1, figsize=(15, 5))
plt.suptitle("Instantaneous density of activated neurons", fontsize=20)

# All neurons start quiescent; the adjacency matrix is fixed throughout.
states = state_vector(N)
adj_matrix = nx.adjacency_matrix(G_power)

for _ in range(4):
    instant_state_vect = time_evolution(N, T, states, adj_matrix, n)
    time_span = list(range(T))

    plt.subplot(1, 2, 1)
    plt.xlabel("Time (ms)")
    plt.ylabel("Instant activity $\\rho_t$")
    plt.plot(time_span, instant_state_vect, label='n = {}'.format(n))
    plt.legend()

    plt.subplot(1, 2, 2)
    plt.xlabel("Time (ms)")
    plt.ylabel("Instant activity $\\rho_t$")
    plt.xlim(200, 450)
    plt.plot(time_span, instant_state_vect, label='n = {}'.format(n))
    plt.grid(True)
    plt.legend()

    n += 1
plt.show()
We can conclude that the Power grid network does not support self-sustainability of the neuronal cascade, with the input stimuli persisting for only a few ms after being turned off.
# --- C. elegans neural network: simulation parameters ---
N = nx.number_of_nodes(G_celegans)  # number of neurons
print(N)
K = 10                              # mean degree
sigma = 1                           # connectivity
r = 0.5                             # Poisson parameter
T = 1000                            # maximum time limit
n = 10                              # number of refractory states

# All neurons start in the quiescent (null) state.
states = state_vector(N)
# Adjacency matrix from the real network; fixed during the dynamics.
adj_matrix = nx.adjacency_matrix(G_celegans)
# Time evolution of the neurons' state vector.
instant_state_vect = time_evolution(N, T, states, adj_matrix, n)
Keeping the external stimuli
# Activity over the full time span (external stimulus kept on).
plt.subplots(1, 1)
plt.title("Instantaneous density of activated neurons", fontsize=15)
plt.xlabel("Time (ms)")
plt.ylabel("Instant activity $\\rho_t$")
plt.plot(list(range(T)), instant_state_vect)
plt.show()
For this one, it is not expected that it can sustain the stimuli, since the network is too small.
# Same plot again for the discussion below (identical data and axes).
plt.subplots(1, 1)
plt.title("Instantaneous density of activated neurons", fontsize=15)
plt.xlabel("Time (ms)")
plt.ylabel("Instant activity $\\rho_t$")
plt.plot(list(range(T)), instant_state_vect)
plt.show()
Therefore, we cannot conclude anything from this network
# --- Protein network: simulation parameters ---
N = nx.number_of_nodes(G_protein)  # number of neurons
print(N)
K = 10                             # mean degree
sigma = 1                          # connectivity
r = 0.5                            # Poisson parameter
T = 1000                           # maximum time limit
n = 10                             # number of refractory states

# All neurons start in the quiescent (null) state.
states = state_vector(N)
# Adjacency matrix from the real network; fixed during the dynamics.
adj_matrix = nx.adjacency_matrix(G_protein)
# Time evolution of the neurons' state vector.
instant_state_vect = time_evolution(N, T, states, adj_matrix, n)
# Two panels: whole run (left) and a zoom on 440-490 ms (right).
time_span = list(range(T))
plt.figure(1, figsize=(15, 5))
plt.suptitle("Instantaneous density of activated neurons", fontsize=20)

plt.subplot(1, 2, 1)
plt.xlabel("Time (ms)")
plt.ylabel("Instant activity $\\rho_t$")
plt.plot(time_span, instant_state_vect)

plt.subplot(1, 2, 2)
plt.xlabel("Time (ms)")
plt.ylabel("Instant activity $\\rho_t$")
plt.xlim(440, 490)
plt.plot(time_span, instant_state_vect)

plt.show()
And let us change the number of refractory states
# --- Protein network: stimulus switched off, n = 2, 3, 4, 5 ---
N = nx.number_of_nodes(G_protein)  # number of neurons
r = 0.5                            # Poisson parameter
T = 1000                           # maximum time limit
n = 2                              # number of refractory states

plt.figure(1, figsize=(15, 5))
plt.suptitle("Instantaneous density of activated neurons", fontsize=20)

# All neurons start quiescent; the adjacency matrix is fixed throughout.
states = state_vector(N)
adj_matrix = nx.adjacency_matrix(G_protein)

for _ in range(4):
    instant_state_vect = time_evolution(N, T, states, adj_matrix, n)
    time_span = list(range(T))

    plt.subplot(1, 2, 1)
    plt.xlabel("Time (ms)")
    plt.ylabel("Instant activity $\\rho_t$")
    plt.plot(time_span, instant_state_vect, label='n = {}'.format(n))
    plt.legend()

    plt.subplot(1, 2, 2)
    plt.xlabel("Time (ms)")
    plt.ylabel("Instant activity $\\rho_t$")
    plt.xlim(200, 450)
    plt.plot(time_span, instant_state_vect, label='n = {}'.format(n))
    plt.grid(True)
    plt.legend()

    n += 1
plt.show()
This network also cannot maintain the stimuli, working only for n=2 (restless neuron).
# --- R collaborators network: simulation parameters ---
N = nx.number_of_nodes(G_Rcolab)  # number of neurons
print(N)
K = 10                            # mean degree
sigma = 1                         # connectivity
r = 0.5                           # Poisson parameter
T = 1000                          # maximum time limit
n = 10                            # number of refractory states

# All neurons start in the quiescent (null) state.
states = state_vector(N)
# Adjacency matrix from the real network; fixed during the dynamics.
adj_matrix = nx.adjacency_matrix(G_Rcolab)
# Time evolution of the neurons' state vector.
instant_state_vect = time_evolution(N, T, states, adj_matrix, n)
# Two panels: whole run (left) and a zoom on 440-490 ms (right).
time_span = list(range(T))
plt.figure(1, figsize=(15, 5))
plt.suptitle("Instantaneous density of activated neurons", fontsize=20)

plt.subplot(1, 2, 1)
plt.xlabel("Time (ms)")
plt.ylabel("Instant activity $\\rho_t$")
plt.plot(time_span, instant_state_vect)

plt.subplot(1, 2, 2)
plt.xlabel("Time (ms)")
plt.ylabel("Instant activity $\\rho_t$")
plt.xlim(440, 490)
plt.plot(time_span, instant_state_vect)

plt.show()
And, finally, for the self-sustainability trial
# --- R collaborators: stimulus switched off, n = 2, 3, 4, 5 ---
N = nx.number_of_nodes(G_Rcolab)  # number of neurons
r = 0.5                           # Poisson parameter
T = 1000                          # maximum time limit
n = 2                             # number of refractory states

plt.figure(1, figsize=(15, 5))
plt.suptitle("Instantaneous density of activated neurons", fontsize=20)

# All neurons start quiescent; the adjacency matrix is fixed throughout.
states = state_vector(N)
adj_matrix = nx.adjacency_matrix(G_Rcolab)

for _ in range(4):
    instant_state_vect = time_evolution(N, T, states, adj_matrix, n)
    time_span = list(range(T))

    plt.subplot(1, 2, 1)
    plt.xlabel("Time (ms)")
    plt.ylabel("Instant activity $\\rho_t$")
    plt.plot(time_span, instant_state_vect, label='n = {}'.format(n))
    plt.legend()

    plt.subplot(1, 2, 2)
    plt.xlabel("Time (ms)")
    plt.ylabel("Instant activity $\\rho_t$")
    plt.xlim(200, 450)
    plt.plot(time_span, instant_state_vect, label='n = {}'.format(n))
    plt.grid(True)
    plt.legend()

    n += 1
plt.show()
# Read the Drosophila medulla connectome (graphml) and keep only the
# giant connected component, mirroring read_gml_and_Gcc/read_txt_and_Gcc.
G_fly = nx.read_graphml('/home/jorge/Documents/redes_complexas/drosophila_medulla_1.graphml')
G_fly = nx.convert_node_labels_to_integers(G_fly, first_label=0)
labels = G_fly.nodes()
pos = nx.spring_layout(G_fly)
G_fly = G_fly.to_undirected()
# Fix: drop self-loops before extracting the giant component, for
# consistency with the reader helpers used for every other network.
G_fly.remove_edges_from(G_fly.selfloop_edges())
Gcc = sorted(nx.connected_component_subgraphs(G_fly), key=len, reverse=True)
G_fly = Gcc[0]
Since we have not displayed this network yet (it can be found at: https://neurodata.io/project/connectomes/), we will first draw it using both the spring layout and the Kamada–Kawai layout.
# Draw the fly network with the spring layout computed above...
plt.figure(figsize=(15, 10))
nx.draw_networkx(G_fly, pos, node_size=10, width=.1, with_labels=False)
# Fix: plt.show() takes only the keyword argument `block`; the positional
# True was never documented and is rejected by modern matplotlib.
plt.show()

# ...and again with the Kamada-Kawai layout for comparison.
pos = nx.kamada_kawai_layout(G_fly)
plt.figure(figsize=(15, 10))
nx.draw_networkx(G_fly, pos, node_size=10, width=.1, with_labels=False)
plt.show()
Now, we can perform the dynamical process
# --- Fly (Drosophila medulla) network: simulation parameters ---
N = nx.number_of_nodes(G_fly)  # number of neurons
print(N)
K = 10                         # mean degree
sigma = 1                      # connectivity
r = 0.5                        # Poisson parameter
T = 1000                       # maximum time limit
n = 10                         # number of refractory states

# All neurons start in the quiescent (null) state.
states = state_vector(N)
# Adjacency matrix from the real network; fixed during the dynamics.
adj_matrix = nx.adjacency_matrix(G_fly)
# Time evolution of the neurons' state vector.
instant_state_vect = time_evolution(N, T, states, adj_matrix, n)
# Two panels: whole run (left) and a zoom on 440-490 ms (right).
time_span = list(range(T))
plt.figure(1, figsize=(15, 5))
plt.suptitle("Instantaneous density of activated neurons", fontsize=20)

plt.subplot(1, 2, 1)
plt.xlabel("Time (ms)")
plt.ylabel("Instant activity $\\rho_t$")
plt.plot(time_span, instant_state_vect)

plt.subplot(1, 2, 2)
plt.xlabel("Time (ms)")
plt.ylabel("Instant activity $\\rho_t$")
plt.xlim(440, 490)
plt.plot(time_span, instant_state_vect)

plt.show()
Now, for a turning-off external stimuli
# --- Fly network: stimulus switched off, n = 2, 3, 4, 5 ---
# Fix: this cell continues the fly-network analysis introduced just above,
# but the original code reused G_Rcolab (copy-paste of the previous
# experiment, which it duplicated verbatim). Use G_fly instead.
N = nx.number_of_nodes(G_fly)  # number of neurons
r = 0.5                        # Poisson parameter
T = 1000                       # maximum time limit
n = 2                          # number of refractory states

plt.figure(1, figsize=(15, 5))
plt.suptitle("Instantaneous density of activated neurons", fontsize=20)

# All neurons start quiescent; the adjacency matrix is fixed throughout.
states = state_vector(N)
adj_matrix = nx.adjacency_matrix(G_fly)

for _ in range(4):
    # Time evolution of the neurons' state vector.
    instant_state_vect = time_evolution(N, T, states, adj_matrix, n)
    time_span = list(range(T))

    plt.subplot(1, 2, 1)
    plt.xlabel("Time (ms)")
    plt.ylabel("Instant activity $\\rho_t$")
    plt.plot(time_span, instant_state_vect, label='n = {}'.format(n))
    plt.legend()

    plt.subplot(1, 2, 2)
    plt.xlabel("Time (ms)")
    plt.ylabel("Instant activity $\\rho_t$")
    plt.xlim(200, 450)
    plt.plot(time_span, instant_state_vect, label='n = {}'.format(n))
    plt.grid(True)
    plt.legend()

    n += 1
plt.show()
We can conclude that the neuronal cascade only happens on networks with a structure similar to the Erdős–Rényi random graph. The Power grid is slightly similar to it, and happens to be the only one in which the external stimuli can persist, even if for a small fraction of time. Therefore, the neuronal cascade cannot be self-sustainable for other types of networks.